from:"Rémi Denis\-Courmont"

[FFmpeg-devel] [PATCH] checkasm/lpc: test compute_autocorr

2023-12-11 Thread Rémi Denis-Courmont

---
 tests/checkasm/lpc.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/tests/checkasm/lpc.c b/tests/checkasm/lpc.c
index 592e34c03d..8e92a9e1b4 100644
--- a/tests/checkasm/lpc.c
+++ b/tests/checkasm/lpc.c
@@ -57,10 +57,40 @@ static void test_window(int len)
 bench_new(src, len, dst1);
 }
 
+static void test_compute_autocorr(int lag)
+{
+LOCAL_ALIGNED(16, double, src, [5000]);
+LOCAL_ALIGNED(16, double, dst0, [32]);
+LOCAL_ALIGNED(16, double, dst1, [32]);
+const size_t len = 5000;
+
+declare_func(void, const double *in, ptrdiff_t len, int lag, double *out);
+
+for (size_t i = 0; i < len; i++) {
+src[i] = (double)rnd() / (double)UINT_MAX;
+}
+
+call_ref(src, len, lag, dst0);
+call_new(src, len, lag, dst1);
+
+for (size_t i = 0; i < lag; i++) {
+if (!double_near_abs_eps(dst0[i], dst1[i], EPS)) {
+fprintf(stderr, "%zu: %- .12f - %- .12f = % .12g\n",
+i, dst0[i], dst1[i], dst0[i] - dst1[i]);
+fail();
+break;
+}
+}
+
+bench_new(src, len, lag, dst1);
+}
+
 void checkasm_check_lpc(void)
 {
 LPCContext ctx;
 int len = rnd() % 5000;
+static const int lags[] = { 10, 30, 32 };
+
 ff_lpc_init(, 32, 16, FF_LPC_TYPE_DEFAULT);
 
 if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_even")) {
@@ -73,5 +103,11 @@ void checkasm_check_lpc(void)
 }
 report("apply_welch_window_odd");
 
+for (size_t i = 0; i < FF_ARRAY_ELEMS(lags); i++) {
+if (check_func(ctx.lpc_compute_autocorr, "autocorr_%d", lags[i]))
+test_compute_autocorr(lags[i]);
+report("compute_autocorr_%d", lags[i]);
+}
+
 ff_lpc_end();
 }
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm/lpc: test compute_autocorr

2023-12-11 Thread Rémi Denis-Courmont

Le maanantaina 11. joulukuuta 2023, 22.41.03 EET Rémi Denis-Courmont a écrit :
> ---
>  tests/checkasm/lpc.c | 36 
>  1 file changed, 36 insertions(+)
> 
> diff --git a/tests/checkasm/lpc.c b/tests/checkasm/lpc.c
> index 592e34c03d..8e92a9e1b4 100644
> --- a/tests/checkasm/lpc.c
> +++ b/tests/checkasm/lpc.c
> @@ -57,10 +57,40 @@ static void test_window(int len)
>  bench_new(src, len, dst1);
>  }
> 
> +static void test_compute_autocorr(int lag)
> +{
> +LOCAL_ALIGNED(16, double, src, [5000]);
> +LOCAL_ALIGNED(16, double, dst0, [32]);
> +LOCAL_ALIGNED(16, double, dst1, [32]);
> +const size_t len = 5000;
> +
> +declare_func(void, const double *in, ptrdiff_t len, int lag, double
> *out); +
> +for (size_t i = 0; i < len; i++) {
> +src[i] = (double)rnd() / (double)UINT_MAX;

Not sure if we should test negative numbers here.

> +}
> +
> +call_ref(src, len, lag, dst0);
> +call_new(src, len, lag, dst1);

Presumably src needs to be offset by one element, as the first iteration of the 
loop reads at offset minus one (in C code: sum1 += ...).

> +
> +for (size_t i = 0; i < lag; i++) {
> +if (!double_near_abs_eps(dst0[i], dst1[i], EPS)) {
> +fprintf(stderr, "%zu: %- .12f - %- .12f = % .12g\n",
> +i, dst0[i], dst1[i], dst0[i] - dst1[i]);
> +fail();
> +break;
> +}
> +}
> +
> +bench_new(src, len, lag, dst1);
> +}
> +
>  void checkasm_check_lpc(void)
>  {
>  LPCContext ctx;
>  int len = rnd() % 5000;
> +static const int lags[] = { 10, 30, 32 };
> +
>  ff_lpc_init(, 32, 16, FF_LPC_TYPE_DEFAULT);
> 
>  if (check_func(ctx.lpc_apply_welch_window, "apply_welch_window_even"))
> { @@ -73,5 +103,11 @@ void checkasm_check_lpc(void)
>  }
>  report("apply_welch_window_odd");
> 
> +for (size_t i = 0; i < FF_ARRAY_ELEMS(lags); i++) {
> +if (check_func(ctx.lpc_compute_autocorr, "autocorr_%d", lags[i]))
> +test_compute_autocorr(lags[i]);
> +report("compute_autocorr_%d", lags[i]);
> +}
> +
>  ff_lpc_end();
>  }


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2023-12-30 Thread Rémi Denis-Courmont



Le 29 décembre 2023 12:57:20 GMT+01:00, flow gg  a écrit :
>C908
>ssd_int8_vs_int16_c: 207.7
>ssd_int8_vs_int16_rvv_i32: 28.0

At a quick glance, it won't work if the input length is not a multiple of the 
vector length.

Also do you really need to extend accumulators to 32 bits?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2023-12-30 Thread Rémi Denis-Courmont



Le 30 décembre 2023 15:00:53 GMT+01:00, flow gg  a écrit :
>> At a quick glance, it won't work if the input length is not a multiple of
>the vector length.
>
>Why?

You're not handling tails as far as I see.

> I tried 1024, 32*3, 32*7 and all passed the test.

They're all multiples of the vector length.

>> Also do you really need to extend accumulators to 32 bits?
>
>It won't overflow after the test is changed, so it's not needed anymore.
>I have modified it in this reply.
>
>Rémi Denis-Courmont  于2023年12月30日周六 20:15写道：
>
>>
>>
>> Le 29 décembre 2023 12:57:20 GMT+01:00, flow gg  a
>> écrit :
>> >C908
>> >ssd_int8_vs_int16_c: 207.7
>> >ssd_int8_vs_int16_rvv_i32: 28.0
>>
>> At a quick glance, it won't work if the input length is not a multiple of
>> the vector length.
>>
>> Also do you really need to extend accumulators to 32 bits?
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/3] checkasm/svqenc: add ssd_int8_vs_int16 test

2023-12-30 Thread Rémi Denis-Courmont



Le 29 décembre 2023 12:57:01 GMT+01:00, flow gg  a écrit :
>Tests on x86 might fail, possibly due to a 16-bit sub overflow

I don't know anything about the SVQ encoder. Still, especially for an encoder, 
overflows are probably not expected. So then it is as Martin wrote.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate: Allow running multiple rounds of tests with differing settings

2023-12-13 Thread Rémi Denis-Courmont

Le tiistaina 12. joulukuuta 2023, 0.14.06 EET Martin Storsjö a écrit :
> This can be used to run tests multple times, with e.g. differing
> QEMU settings, by adding something like this to the FATE configuration
> file:
> 
> target_exec="qemu-aarch64-static"
> fate_targets="fate-checkasm fate-cpu"
> 
> fate_environments="sve128 sve256 sve512"
> sve128_env="QEMU_CPU=max,sve128=on"
> sve256_env="QEMU_CPU=max,sve256=on"
> sve512_env="QEMU_CPU=max,sve512=on"

I'm fine with that, but for the sake of generality, shouldn't rather the entire 
target_exec prefix be indirected? Some runners may want to use command line 
flags rather than environment variables.


-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2024-01-04 Thread Rémi Denis-Courmont

Le lauantaina 30. joulukuuta 2023, 18.20.15 EET flow gg a écrit :
> I mistook it, seeing the vector length as the length of the vector register
> ..
> I have modified it in this reply.

Setting element size to 8-bit is unnecessary, and a widening subtraction can 
presumably avoid the sign extension.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-05 Thread Rémi Denis-Courmont

Le keskiviikkona 3. tammikuuta 2024, 2.56.12 EET Lynne a écrit :
> As some of you know, my laptop died nearly 2 years ago, and
> I've been working on a desktop machine, which is currently a Zen 3.
> AVX512 has become more popular in the meantime, with Zen 4
> and future AMD CPUs shipping with it, but currently, we have very
> little AVX512.

Frankly, generally speaking, I don't think it makes sense to buy laptops for 
development *unless* desktop systems are not an option.

And here, a desktop system is not only an option, but it is the technically 
better and already purchased option. A desktop is cheaper, faster, more 
serviceable and more incrementally upgradeable. More prosaically a desktop 
system is much more suitable to occupational well-being - laptops are awfully 
inadequate in terms of ergonomy, unless they are docked, at which point they 
become expensive under-provisioned desktops.

A laptop would of course be necessary whilst spending extended periods of your 
time away from home. But if so, that would be a discretionary lifestyle 
choice. There is nothing wrong with doing that per se, but I really 
don't think that an open-source foundation should be addressing discretionary 
life style choices.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-06 Thread Rémi Denis-Courmont

Le lauantaina 6. tammikuuta 2024, 19.59.47 EET Michael Niedermayer a écrit :
> What i do with my laptop is i have it on this thing:
> https://www.amazon.de/gp/product/B072PZLZ25

> That can adjust tilt, rotate and height (and of course it can be moved
> around on  the table)
> put a good keyboard below it and a good mouse to its right.

I think that I already addressed that up-thread? A docked laptop is basically 
the same as a desktop system, but more expensive and less powerful.

I can't imagine that Lynne would use the laptop for development from her home, 
whilst she has already gotten quite the S-class monster of a desktop 
workstation for that purpose, whose technical specifications are sure to 
outclass any contemporary laptop.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-06 Thread Rémi Denis-Courmont



Le 6 janvier 2024 20:26:42 GMT+02:00, Michael Niedermayer 
 a écrit : 
>
>I think some kind of remotely usable system does make sense for every volunteer
>who wants to work. It simply results in more available time for that work.
>
>Even i (who doesnt travel volunteerly around) have needed and used my notebook
>for FFmpeg away from my desktop system many times.
>When ive spend some time in appartments of other familiy members, when i had to
>change my own apartment due to very noisy neighbors and so forth
>
>Maybe a compromise would be a cheap laptop that is just used to login and
>access the more powerfull hardware via SSH ?

That sounds much more sensible indeed.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-06 Thread Rémi Denis-Courmont

Le lauantaina 6. tammikuuta 2024, 12.38.28 EET Lynne a écrit :
> Emergencies could happen, but progress must always happen.

Laptops are more prone to breaking, and as already noted less serviceable. The 
whole premise is that your current laptop broke after just 2 years, while the 
normally (fiscally) expected lifetime of a laptop is 3 years.

Don't get me wrong! I don't deny that emergencies of that sort do happen to 
software engineers. Considering the "running cost" of a skilled software 
engineer, many employers will want to minimise the risk that they get bogged 
down by lack of a development computer, and the inability to carry a laptop 
with them on business trips.

But then, whose emergencies would they be exactly? If an entity has dire need 
of your continued ability to work, then they should take the measures and 
costs. That's just not something that the FFmpeg foundation should bear. 
Except maybe for Michael, I think the project will do just fine if any 
developer is out of a computer for a week, to be honest.

Finally, the flip side of this is that the ergonomy and performance of your 
FFmpeg development environment is at least as critical, if not more, to your 
continued ability to work. In other words, if a developer is critical to the 
project, then it is detrimental to the project if they use a laptop, because 
laptops are slower and less healthy.

> Also, I think some developers here would disagree with the notion that
> desktop machines are always the best option, and I think I that subjects
> such as ergonomy, uselessness when not docked,

That's simply not a matter of subjective opinion of a hypothetical developer. 
The point about ergonomy is generally accepted among specialists based on 
serious studies. And by specialists I mean medical doctors and occupational 
healthcare therapists, not FFmpeg developers. That's not "subjective" in my 
book.

> being less serviceable, are
> subjective metrics.

How about you count the number of parts that can be independently replaced in 
a laptop vs a desktop. Care to explain how that metric is "subjective" 
exactly? And that's not even counting that some of the serviceable laptop 
parts are more or less model-dependent.

-- 
レミ・デニ-クールモン
http://www.remlab.net/
 


___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-06 Thread Rémi Denis-Courmont

Le lauantaina 6. tammikuuta 2024, 18.21.00 EET Lynne a écrit :
> As for whether this is a lifestyle choice, we generally pay for anything
> that involves conferences, from train tickets, planes, parking, and
> sometimes for location/stand rent.

I would personally agree that representing FFmpeg at (non-FFmpeg-specific) 
conferences is a choice of life style. But it is normal to refund reasonable 
expenses made to represent the project.

> I'm asking for a useful bit of permanent hardware.

I don't question that providing you with one development system with the 
relevant Vulkan hardware support and AVX-512 is (or was) justified. If you do 
all the work for free (or paid by some other entity than FFmpeg), that's 
indeed excellent ROI.

But the "business" case for a *second* system with all the disadvantages of a 
laptop is frankly not so clear.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-06 Thread Rémi Denis-Courmont

Le lauantaina 6. tammikuuta 2024, 18.13.33 EET Lynne a écrit :
> A fire would put me out for much more than a week tbh.

Whataboutism much? In this case, you would lose your internet access, and 
potentially spend a long time hospitalised.

You're dodging the real issues here: why should *you* get a laptop? Sure, some 
of the stuff that you do will with high probability become important in the 
medium term (e.g. Vulkan video decoding stuff), but it does not seem so urgent 
as to justify purchasing a second computer. The foundation already invested in 
a well-above average price to equip you with a suitable desktop system for 
your development. It would be far more sensible to spend on updated or 
replacement parts for that system as needed, than to buy a whole new system 
just in case.

Furthermore, there are quite a few key developers and system administrators in 
the project whose continued ability to work is at least equally critical.

> Other than that, occasional trips, during which reviews still have to be
> made. Plus, power analysis of whether AVX512 helps on current-gen mobile
> devices. I don't see this as being too big of a thing to ask for,
> considering how much we have and how much we receive each year, and how
> very rarely requests are done. Nor something that deserves a lengthy
> article on the benefits, ergonomics, and ongoing maintenance of desktop
> versus mobile systems.

Indeed, you should not have made the preposterous argument that my points were 
"subjective" when they were not. Then I would not have had to waste time 
elaborating.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2024-01-06 Thread Rémi Denis-Courmont

Le perjantaina 5. tammikuuta 2024, 2.56.18 EET flow gg a écrit :
> One vset can be reduced, but vwsub should not be used in this case. I
> modified it in this reply.

Fair enough, but are you sure that that's faster than keeping the vsetvli and 
removing the sign extension?

> Rémi Denis-Courmont  于2024年1月5日周五 00:00写道：
> 
> > Le lauantaina 30. joulukuuta 2023, 18.20.15 EET flow gg a écrit :
> > > I mistook it, seeing the vector length as the length of the vector
> > 
> > register
> > 
> > > ..
> > > I have modified it in this reply.
> > 
> > Setting element size to 8-bit is unnecessary, and a widening subtraction
> > can
> > presumably avoid the sign extension.
> > 
> > --
> > レミ・デニ-クールモン
> > http://www.remlab.net/
> > 
> > 
> > 
> > ___
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > 
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2] checkasm: Generalize crash handling

2024-01-10 Thread Rémi Denis-Courmont

Looks OK (not tested).

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2024-01-07 Thread Rémi Denis-Courmont

Le sunnuntaina 7. tammikuuta 2024, 3.33.39 EET flow gg a écrit :
> I tested it, and indeed using vwsub is faster. Updated it in the reply.
> 
> ---
> 
> I have a question: if I tweak the load order a bit, using one less vset, it
> leads to being slower (the patch I submitted is 13.2, if I make the
> following change, the time would be 15.2).
> But I thought it would be faster.

I would guess that v0 is needed before v8 in the internal implementation of 
vwsub. This kind of makes sense as the elements still need to be sign-extended. 
Thus vwsub ends up stalling the pipeline in wait for vle8 to complete. That's 
just a guess though, as I don't have internal cycle timing documentation.

> - vsetvli  t0, a2, e8, m2, tu, ma
> - vle8.v   v0, (a0)
> - sub  a2, a2, t0
> - vsetvli  zero, t0, e16, m4, tu, ma
> - vle16.v  v8, (a1)
> - vsetvli  zero, t0, e8, m2, tu, ma
> - vwsub.wv v16, v8, v0
> 
> + vsetvli  t0, a2, e16, m4, tu, ma
> + vle16.v  v8, (a1)
> + sub  a2, a2, t0
> + vsetvli  zero, t0, e8, m2, tu, ma
> + vle8.v   v0, (a0)
> + vwsub.wv v16, v8, v0

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2024-01-16 Thread Rémi Denis-Courmont

Le sunnuntaina 7. tammikuuta 2024, 10.36.23 EET flow gg a écrit :
> Alright, I learned a bit more, so should we not consider the internal
> implementation?

You asked what the reason was for your counter-intuitive observations, and I 
provided a plausible hypothesis. Nothing more, nothing less.

Of course we should take performance characteristics of real hardware into 
account, as is done on all other ISAs. The flip side however is that we might 
have to make tradeoffs when designs from other vendors come out exhibiting 
different characteristics.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] lavc/svq1enc: R-V V ssd_int8_vs_int16

2024-01-16 Thread Rémi Denis-Courmont

+vsetvli  t0, a2, e8, m2, tu, ma
+vle8.v   v0, (a0)
+sub  a2, a2, t0
+vsetvli  zero, t0, e16, m4, tu, ma
+vle16.v  v8, (a1)
+vsetvli  zero, t0, e8, m2, tu, ma
+vwsub.wv v16, v8, v0
+vsetvli  zero, t0, e16, m4, tu, ma

It looks to me like the second vsetvli is unnecessary, and consequently the 
third as well. As for the later ones, please use `vsetvli zero, zero` if you 
intend to change SEW while preserving VL and the LMUL:SEW ratio.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-15 Thread Rémi Denis-Courmont

Le maanantaina 15. tammikuuta 2024, 16.06.32 EET Paul B Mahol a écrit :
> > I agree with Remi's objections to this.
> > 
> > Kieran
> 
> Poor and irrelevant devs object and want to keep money for themself.

Neither of us are poor, which makes this defamatory.

While we may subjectively be irrelevant, that is completely inappropriate 
wording. For your reference, Nicolas was able to articulate that 
characterisation in a much more business-compatible fashion.

So this is being reported to the CC.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Hardware purchase request: AVX512-capable laptop

2024-01-15 Thread Rémi Denis-Courmont

Le maanantaina 15. tammikuuta 2024, 16.59.40 EET Lynne a écrit :
> I've been pinging this for a week now and he hasn't reiterated
> his position again or made it clearer.

I think my position was clear. I don't see the point in rereiterating it 
whilst we are evidently not going to reach an agreement. Besides, you have 
previously complained that my explanations were unnecessarily long.

But since you bring it up and to sum up, I find it completely reasonable for 
FFmpeg to provision hardware with the features necessary to test your work, 
such as AVX-512 and Vulkan video decoding. But:
1) You already have been provided such hardware in the form of a desktop 
computer (and I am told that it was extremely expensive).
2) In general it makes more sense to get a desktop than a laptop for that 
purpose.

Leaving aside those specific hardware requirements, I think it is completely 
reasonable for you to have a laptop, as most of us probably do. But I also 
think that it is not reasonable for the foundation to pay for personal 
laptops.

Maybe you need a laptop specifically to work on FFmpeg for whatever reason. 
Then a cheap laptop for remote access, as Michael suggested, sounds like a 
reasonable compromise to me. Nevertheless, I think that:
- If your employment requires you to work away from your desktop a lot, then 
your employer should provide the laptop.
- If you want to work from your couch or from the beach (figuratively), that is 
really on you.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Back port riscv: test for assembler support to 6.1

2024-01-18 Thread Rémi Denis-Courmont

Le sunnuntaina 7. tammikuuta 2024, 6.20.29 EET Brad Smith a écrit :
> I don't have a system. But I have attached what should be there or close
> to back ports for 6.1 and 6.0. If someone could please build test these
> patches.

I have no objections but I do not have a test system either.

In any case, the RISC-V support requires OS adaptation to detect multi-
lettered extensions, and it is very unlikely that I will be able to test 
OpenBSD (I don't even know how it's supposed to work).

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Test whether direct cycle counter access works

2024-01-11 Thread Rémi Denis-Courmont

Le torstaina 11. tammikuuta 2024, 14.53.05 EET Martin Storsjö a écrit :
> This should print a nicer error message than crashing due to
> an illegal instruction, if direct cycle counter access isn't
> allowed.
> 
> This matches the dav1d checkasm commit
> 95a192549a448b70d9542e840c4e34b60d09b093.
> ---
>  tests/checkasm/checkasm.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
> index 994d64e96b..9c5abb53dc 100644
> --- a/tests/checkasm/checkasm.c
> +++ b/tests/checkasm/checkasm.c
> @@ -754,6 +754,14 @@ static int bench_init_kperf(void)
>  static int bench_init_ffmpeg(void)
>  {
>  #ifdef AV_READ_TIME
> +if (!checkasm_save_context()) {
> +checkasm_set_signal_handler_state(1);
> +AV_READ_TIME();
> +checkasm_set_signal_handler_state(0);
> +} else {
> +fprintf(stderr, "checkasm: unable to access cycle counter\n");

AV_READ_TIME() reads time, not cycles. If we want cycle count, then we should 
add a separate macro, as the two are different performance counters at least on 
RISC-V. As things stand, this code won't do anything on RISC-V, since 
AV_READ_TIME() actually reads, well, time, not cycles.

> +return -1;
> +}
>  printf("benchmarking with native FFmpeg timers\n");
>  return 0;
>  #else
> @@ -927,7 +935,9 @@ int checkasm_bench_func(void)
>  /* Indicate that the current test has failed */
>  void checkasm_fail_func(const char *msg, ...)
>  {
> -if (state.current_func_ver->cpu && state.current_func_ver->ok) {
> +if (state.current_func_ver && state.current_func_ver->cpu &&
> +state.current_func_ver->ok)
> +{
>  va_list arg;
> 
>  print_cpu_name();


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Test whether direct cycle counter access works

2024-01-11 Thread Rémi Denis-Courmont

Le torstaina 11. tammikuuta 2024, 16.15.29 EET Martin Storsjö a écrit :
> > AV_READ_TIME() reads time, not cycles.
> 
> Right, I can adjust the wording. Exactly what kind of measurement
> AV_READ_TIME returns varies between architectures and environments indeed.

In practice, yes, but I would argue that it's a bug if it does not measure 
time. At the very least because the name is extremely misleading.

> What about:
> 
>  checkasm: unable to execute platform specific timer
> 
> > If we want cycle count, then we should add a separate macro, as the two
> > are different performance counters at least on RISC-V.
> 
> That's not what I try to do here, I just want to test whether the timer,
> whatever we have in AV_READ_TIME, is usable.

Sure, I can live with that, but I thought that checkasm actually preferred to 
measure cycles rather than time periods.
measure cycles than time periods.

> > As things stand, this code won't do anything on RISC-V, sinec
> > AV_READ_TIME() actually reads, well, time, not cycles.
> 
> Should I interpret this, as, the current AV_READ_TIME implementation on
> RISC-V always succeeds, contrary to the previous implementation (with
> rdcycle) which is unavailable on some systems, referencing
> 05115a77e012331b6ff5e24bab40e75848447c62?

Yes.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Generalize crash handling

2023-12-21 Thread Rémi Denis-Courmont



Le 19 décembre 2023 14:02:00 GMT+02:00, "Martin Storsjö"  a 
écrit :
>This replaces the riscv specific handling from
>7212466e735aa187d82f51dadbce957fe3da77f0 (which essentially is
>reverted, together with 286d6742218ba0235c32876b50bf593cb1986353)
>with a different implementation of the same (plus a bit more), based
>on the corresponding feature in dav1d's checkasm, supporting both Unix
>and Windows.
>
>See in particular dav1d commits
>0b6ee30eab2400e4f85b735ad29a68a842c34e21 and
>0421f787ea592fd2cc74c887f20b8dc31393788b, authored by
>Henrik Gramner.
>
>The overall approach is the same; set up a signal handler,
>store the state with setjmp/sigsetjmp, jump out of the crashing
>function with longjmp/siglongjmp.
>
>The main difference is in what happens when the signal handler
>is invoked. In the previous implementation, it would resume from
>right before calling the crashing function, and then skip that call
>based on the setjmp return value.
>
>In the imported implementation from dav1d, we return to right before
>the check_func() call, which will skip testing the current function
>(as the pointer is the same as it was before).
>
>Other differences are:
>- Support for other signal handling mechanisms (Windows
>  AddVectoredExceptionHandler)
>- Using RtlCaptureContext/RtlRestoreContext instead of setjmp/longjmp
>  on Windows with SEH (which adds the design limitation that it doesn't
>  return a value like setjmp does)
>- Only catching signals once per function - if more than one
>  signal is delivered before signal handling is reenabled, any
>  signal is handled as it would without our handler
>- Not using an arch specific signal handler written in assembly
>---
> tests/checkasm/checkasm.c   | 100 ++--
> tests/checkasm/checkasm.h   |  79 ++---
> tests/checkasm/riscv/checkasm.S |  12 
> 3 files changed, 140 insertions(+), 51 deletions(-)
>
>diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
>index 6318d9296b..668034c67f 100644
>--- a/tests/checkasm/checkasm.c
>+++ b/tests/checkasm/checkasm.c
>@@ -23,8 +23,10 @@
> #include "config.h"
> #include "config_components.h"
> 
>-#ifndef _GNU_SOURCE
>-# define _GNU_SOURCE // for syscall (performance monitoring API), strsignal()
>+#if CONFIG_LINUX_PERF
>+# ifndef _GNU_SOURCE
>+#  define _GNU_SOURCE // for syscall (performance monitoring API)
>+# endif
> #endif
> 
> #include 
>@@ -326,6 +328,7 @@ static struct {
> const char *cpu_flag_name;
> const char *test_name;
> int verbose;
>+int catch_signals;
> } state;
> 
> /* PRNG state */
>@@ -627,6 +630,64 @@ static CheckasmFunc *get_func(CheckasmFunc **root, const 
>char *name)
> return f;
> }
> 
>+checkasm_context checkasm_context_buf;
>+
>+/* Crash handling: attempt to catch crashes and handle them
>+ * gracefully instead of just aborting abruptly. */
>+#ifdef _WIN32
>+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
>+static LONG NTAPI signal_handler(EXCEPTION_POINTERS *const e) {
>+const char *err;
>+
>+if (!state.catch_signals)
>+return EXCEPTION_CONTINUE_SEARCH;
>+
>+switch (e->ExceptionRecord->ExceptionCode) {
>+case EXCEPTION_FLT_DIVIDE_BY_ZERO:
>+case EXCEPTION_INT_DIVIDE_BY_ZERO:
>+err = "fatal arithmetic error";
>+break;
>+case EXCEPTION_ILLEGAL_INSTRUCTION:
>+case EXCEPTION_PRIV_INSTRUCTION:
>+err = "illegal instruction";
>+break;
>+case EXCEPTION_ACCESS_VIOLATION:
>+case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
>+case EXCEPTION_DATATYPE_MISALIGNMENT:
>+case EXCEPTION_STACK_OVERFLOW:
>+err = "segmentation fault";
>+break;
>+case EXCEPTION_IN_PAGE_ERROR:
>+err = "bus error";
>+break;
>+default:
>+return EXCEPTION_CONTINUE_SEARCH;
>+}
>+state.catch_signals = 0;
>+checkasm_fail_func("%s", err);
>+checkasm_load_context();
>+return EXCEPTION_CONTINUE_EXECUTION; /* never reached, but shuts up gcc */
>+}
>+#endif
>+#else
>+static void signal_handler(const int s) {
>+if (state.catch_signals) {
>+state.catch_signals = 0;
>+checkasm_fail_func("%s",
>+   s == SIGFPE ? "fatal arithmetic error" :
>+   s == SIGILL ? "illegal instruction" :
>+   s == SIGBUS ? "bus error" :
>+ "segmentation fault");
>+checkasm_load_context();

Use of format string is probably not async-signal-safe. I would also be 
surprised if the load_context() function was safe in signal context. That's why 
the current code does pretty much nothing other than a long jump.

>+} else {
>+/* fall back to the default signal handler */
>+static const struct sigaction default_sa = { .sa_handler = SIG_DFL };
>+sigaction(s, &default_sa, NULL);
>+raise(s);
>+}
>+}
>+#endif
>+
> /* Perform tests and benchmarks for the specified cpu flag if supported by 
>

Re: [FFmpeg-devel] [RFC] fftools/ffmpeg and libavdevice/sdl issue

2023-12-19 Thread Rémi Denis-Courmont



Le 19 décembre 2023 14:51:21 GMT+02:00, Nicolas George  a 
écrit :
>Rémi Denis-Courmont (12023-12-19):
>> Anton's objections are against the horrible hacks necessary to support
>> Mac and Windows, as far as I understand him.
>
>I have not read that. If that is true, maybe he could start with
>refraining from using expressions like “horrible hacks”.
>
>> Of course it's also objectionable for SDL to be modelled as a muxer,
>
>Sigh. Do we have to explain this once again? Devices have to present as
>muxers and demuxers in order to be usable transparently by applications
>designed for plain files. And anyway, the manner frames enter or leave a
>device is orthogonal to the implementation of said device, so bringing
>this question in the discussion is irrelevant.

That's a horrible hack of the kind that makes one infer that whoever wrote the 
library doesn't understand API design.

>> Running on the main thread (the initial thread of an address space)
>> requires an external executable
>
>No. Or [citation needed].

I don't care if you disagree with the definition of "main thread" in the 
context of SDL.

>> Besides, starting a new process without execution of an executable, in
>> other words, forking without executing, is essentially impossible in a
>> multithreaded Unix-like environment,
>
>It is less than standards-compliant and portable, but it is doable.

You could certainly engineer a custom OS that would allow this, but I don't 
think that's really relevant, whilst the issue at stake is support for Apple's 
OS. Hence "essentially impossible" as opposed to "impossible".

>
>> since FFmpeg is not async-fork-safe.
>
>This is something that should be fixed, do you not think?

First, good luck with that. Making FFmpeg work under POSIX fork-safe 
constraints is simply not realistic, not to mention the underlying libraries 
that FFmpeg would have to fork (pun unintended). If it were feasible, we 
wouldn't need to have this argument: somebody could just fix the SDL muxer 
internals without messing with the FFmpeg APIs.

Second, even if you did succeed at this,  the result would be unmaintainable, 
as you'd have to mind those constraints for all future code changes.

And third, you would leak memory and resources of other threads that just 
happened to be allocated to the parent process at the time of fork. This is 
highly undesirable.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-26 Thread Rémi Denis-Courmont

Will push soon, barring objections.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-26 Thread Rémi Denis-Courmont

Le sunnuntaina 19. marraskuuta 2023, 0.28.10 EET flow gg a écrit :
> From 2785ce57f68dbb2373c951b9432afa73796f7cc1 Mon Sep 17 00:00:00 2001
> From: sunyuechi 
> Date: Sat, 18 Nov 2023 10:58:17 +0800
> Subject: [PATCH] checkasm: test for dcmul_add

git-am reports the patch corrupt.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm: Fix the signature of float_to_fixed24

2023-12-02 Thread Rémi Denis-Courmont

Lgtm
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [ANNOUNCE] upcoming vote: TC/CC elections

2023-12-05 Thread Rémi Denis-Courmont



Le 5 décembre 2023 15:28:54 GMT+02:00, James Almer  a écrit :
>On 12/5/2023 7:07 AM, Anton Khirnov wrote:
>> Hi all,
>> Both elections have now concluded.
>> 
>> We have 36 votes for the CC election (70% turnout) and 38 votes for TC
>> (75% turnout); raw votes in CSV format are attached.
>> 
>> The CC members now are:
>> * James Almer
>> * Jean-Baptiste Kempf
>> * Anton Khirnov
>> * Ronald Bultje
>> * Michael Niedermayer
>> 
>> For TC, it seems that we have a tie. The system reports two winning
>> sets, both of which contain:
>> * Michael Niedermayer
>> * Martin Storsjö
>> * Mark Thompson
>> * Anton Khirnov
>> 
>> The final member is Jan Ekström in one set and Niklas Haas in the other.
>> We should now consider how to break this tie. Some options suggested on
>> IRC were:
>> * run a new vote with just the two of them
>> * randomly
>> * have Rémi break the tie, as he said he accidentally voted incorrectly
>>due to misinterpreting the documentation
>
>This doesn't feel nice, having one person decide like this.

I don't want to do that anyway as that feels like a very uncomfortable position 
to be in. Nobody can force me. So that option is off the table.

>> * expand the committee
>
>Six members mean a vote could end up in a tie.
>
>IMO, either we do a new vote with the two of them as options, or one of them 
>steps down.
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] doc: mention that for RISC-V, we prefer .S files

2023-12-05 Thread Rémi Denis-Courmont



Le 5 décembre 2023 11:59:39 GMT+02:00, Jean-Baptiste Kempf  
a écrit :
>$subject
>
>See attachment.

I think that the non-ISA specification is a better reference than GNU/binutils. 
The latter takes some controversial liberties with the former. And while I 
blame LLVM as a project for sitting on the `.option arch` support patch set for 
months and months, I don't blame them for adhering to the specification where 
binutils doesn't.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] riscv: test for assembler support

2023-12-05 Thread Rémi Denis-Courmont

This should fix the build on LLVM 16 and earlier, at the cost of turning
all non-RVV optimisations off.
---
 Makefile|  6 +++---
 configure   |  5 -
 ffbuild/arch.mak|  1 +
 libavcodec/riscv/Makefile   | 16 
 libavcodec/riscv/ac3dsp_init.c  |  2 ++
 libavcodec/riscv/audiodsp_init.c|  2 ++
 libavcodec/riscv/bswapdsp_init.c|  2 ++
 libavcodec/riscv/pixblockdsp_init.c |  2 ++
 libswscale/riscv/Makefile   |  2 +-
 libswscale/riscv/rgb2rgb.c  |  2 ++
 tests/checkasm/Makefile |  2 +-
 tests/checkasm/checkasm.c   |  2 +-
 tests/checkasm/checkasm.h   |  5 -
 13 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 78652c47bd..2fc3e538c1 100644
--- a/Makefile
+++ b/Makefile
@@ -93,10 +93,10 @@ ffbuild/.config: $(CONFIGURABLE_COMPONENTS)
 SUBDIR_VARS := CLEANFILES FFLIBS HOSTPROGS TESTPROGS TOOLS   \
HEADERS ARCH_HEADERS BUILT_HEADERS SKIPHEADERS\
ARMV5TE-OBJS ARMV6-OBJS ARMV8-OBJS VFP-OBJS NEON-OBJS \
-   ALTIVEC-OBJS VSX-OBJS RVV-OBJS MMX-OBJS X86ASM-OBJS   \
+   ALTIVEC-OBJS VSX-OBJS MMX-OBJS X86ASM-OBJS\
MIPSFPU-OBJS MIPSDSPR2-OBJS MIPSDSP-OBJS MSA-OBJS \
-   MMI-OBJS LSX-OBJS LASX-OBJS OBJS SLIBOBJS SHLIBOBJS   \
-   STLIBOBJS HOSTOBJS TESTOBJS
+   MMI-OBJS LSX-OBJS LASX-OBJS RV-OBJS RVV-OBJS  \
+   OBJS SLIBOBJS SHLIBOBJS STLIBOBJS HOSTOBJS TESTOBJS
 
 define RESET
 $(1) :=
diff --git a/configure b/configure
index d77c053226..7d2ee66000 100755
--- a/configure
+++ b/configure
@@ -2154,6 +2154,7 @@ ARCH_EXT_LIST_PPC="
 "
 
 ARCH_EXT_LIST_RISCV="
+rv
 rvv
 "
 
@@ -2679,7 +2680,8 @@ ppc4xx_deps="ppc"
 vsx_deps="altivec"
 power8_deps="vsx"
 
-rvv_deps="riscv"
+rv_deps="riscv"
+rvv_deps="rv"
 
 loongson2_deps="mips"
 loongson3_deps="mips"
@@ -6243,6 +6245,7 @@ elif enabled ppc; then
 
 elif enabled riscv; then
 
+enabled rv && check_inline_asm rv '".option arch, +zbb\nrev8 t0, t1"'
 enabled rvv && check_inline_asm rvv '".option arch, +v\nvsetivli zero, 0, 
e8, m1, ta, ma"'
 
 elif enabled x86; then
diff --git a/ffbuild/arch.mak b/ffbuild/arch.mak
index 39d76ee152..23a3feb090 100644
--- a/ffbuild/arch.mak
+++ b/ffbuild/arch.mak
@@ -15,6 +15,7 @@ OBJS-$(HAVE_LASX)  += $(LASX-OBJS)   $(LASX-OBJS-yes)
 OBJS-$(HAVE_ALTIVEC) += $(ALTIVEC-OBJS) $(ALTIVEC-OBJS-yes)
 OBJS-$(HAVE_VSX) += $(VSX-OBJS) $(VSX-OBJS-yes)
 
+OBJS-$(HAVE_RV)  += $(RV-OBJS)  $(RV-OBJS-yes)
 OBJS-$(HAVE_RVV) += $(RVV-OBJS) $(RVV-OBJS-yes)
 
 OBJS-$(HAVE_MMX) += $(MMX-OBJS) $(MMX-OBJS-yes)
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..74381e3648 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,14 +1,14 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
-OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o
+RV-OBJS-$(CONFIG_AC3DSP) +=riscv/ac3dsp_rvb.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
-OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
-   riscv/audiodsp_rvf.o
+OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o
+RV-OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_rvf.o
 RVV-OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_rvv.o
-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
-   riscv/bswapdsp_rvb.o
+OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o
+RV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvb.o
 RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
 OBJS-$(CONFIG_EXR_DECODER) += riscv/exrdsp_init.o
 RVV-OBJS-$(CONFIG_EXR_DECODER) += riscv/exrdsp_rvv.o
@@ -34,8 +34,8 @@ OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_init.o
 RVV-OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
 RVV-OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_rvv.o
-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
-  riscv/pixblockdsp_rvi.o
+OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o
+RV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvi.o
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..92678ea810 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -29,10 +29,12 @@ void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, 
int nb_coefs);

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-05 Thread Rémi Denis-Courmont

Instruction scheduling could be better, especially on in-order CPUs.

> +vzext.vf2 v8, v0
> +vadd.vx   v8, v8, t2
> +vmax.vx   v8, v8, zero
> +vsetvli   zero, t0, e8, m4, ta, ma

You don't need to reset the AVL here, just pass zero.

> +vnclipu.wi v0, v8, 0
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vsse64.v  v0, (a0), a1
> +ret
> +endfunc
> +
> +func ff_vc1_inv_trans_4x8_dc_rvv, zve32x
> +lh    t2, (a2)
> +slli  t1, t2, 4
> +add   t2, t2, t1
> +addi  t2, t2, 4
> +srai  t2, t2, 3
> +sh1add t2, t2, t2
> +slli  t2, t2, 2
> +addi  t2, t2, 64
> +srai  t2, t2, 7
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse32.v  v0, (a0), a1
> +li    t0, 4*8
> +vsetvli   zero, t0, e16, m4, ta, ma
> +vzext.vf2 v4, v0
> +vadd.vx   v4, v4, t2
> +vmax.vx   v4, v4, zero
> +vsetvli   zero, t0, e8, m2, ta, ma
> +vnclipu.wi v0, v4, 0
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vsse32.v  v0, (a0), a1
> +ret
> +endfunc
> +
> +func ff_vc1_inv_trans_8x4_dc_rvv, zve64x
> +lh    t2, (a2)
> +sh1add t2, t2, t2
> +addi  t2, t2, 1
> +srai  t2, t2, 1
> +slli  t1, t2, 4
> +add   t2, t2, t1
> +addi  t2, t2, 64
> +srai  t2, t2, 7
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v0, (a0), a1
> +li    t0, 8*4
> +vsetvli   zero, t0, e16, m4, ta, ma
> +vzext.vf2 v4, v0
> +vadd.vx   v4, v4, t2
> +vmax.vx   v4, v4, zero
> +vsetvli   zero, t0, e8, m2, ta, ma
> +vnclipu.wi v0, v4, 0
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vsse64.v  v0, (a0), a1
> +ret
> +endfunc
> +
> +func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
> +lh    t2, (a2)
> +slli  t1, t2, 4
> +add   t2, t2, t1
> +addi  t2, t2, 4
> +srai  t2, t2, 3
> +slli  t1, t2, 4
> +add   t2, t2, t1
> +addi  t2, t2, 64
> +srai  t2, t2, 7
> +vsetivli  zero, 4, e8, mf2, ta, ma
> +vlse32.v  v0, (a0), a1
> +li    t0, 4*4
> +vsetvli   zero, t0, e16, m2, ta, ma

vsetivli

> +vzext.vf2 v2, v0
> +vadd.vx   v2, v2, t2
> +vmax.vx   v2, v2, zero
> +vsetvli   zero, t0, e8, m1, ta, ma
> +vnclipu.wi v0, v2, 0
> +vsetivli  zero, 4, e8, mf2, ta, ma
> +vsse32.v  v0, (a0), a1
> +ret
> +endfunc
> diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
> index 62c8eb21fa..2caa3c6863 100644
> --- a/libavcodec/vc1dsp.c
> +++ b/libavcodec/vc1dsp.c
> @@ -1039,6 +1039,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
>  ff_vc1dsp_init_arm(dsp);
>  #elif ARCH_PPC
>  ff_vc1dsp_init_ppc(dsp);
> +#elif ARCH_RISCV
> +ff_vc1dsp_init_riscv(dsp);
>  #elif ARCH_X86
>  ff_vc1dsp_init_x86(dsp);
>  #elif ARCH_MIPS
> diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
> index 7ed1776ca7..e3b90d2b62 100644
> --- a/libavcodec/vc1dsp.h
> +++ b/libavcodec/vc1dsp.h
> @@ -89,6 +89,7 @@ void ff_vc1dsp_init(VC1DSPContext* c);
>  void ff_vc1dsp_init_aarch64(VC1DSPContext* dsp);
>  void ff_vc1dsp_init_arm(VC1DSPContext* dsp);
>  void ff_vc1dsp_init_ppc(VC1DSPContext *c);
> +void ff_vc1dsp_init_riscv(VC1DSPContext *c);
>  void ff_vc1dsp_init_x86(VC1DSPContext* dsp);
>  void ff_vc1dsp_init_mips(VC1DSPContext* dsp);
>  void ff_vc1dsp_init_loongarch(VC1DSPContext* dsp);


-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-05 Thread Rémi Denis-Courmont

Le tiistaina 5. joulukuuta 2023, 21.25.12 EET flow gg a écrit :
> > This block can be folded into the next. You don't need to check VLENB
> 
> twice.
> 
> Changed.
> 
> > Instruction scheduling could be better, especially on in-order CPUs.
> 
> I put the vload at the front, and then proceeded with the t2 operation, but
> I'm not sure...
> 
> > You don't need to reset the AVL here, just pass zero.
> 
> Changed.
> 
> > vsetivli
> 
> Changed.

You changed more than I asked for. The immediate AVL is a 5-bit unsigned 
integer, so it should not be possible to assemble 32 or 64, unless you have a 
preprocessor that silently rewrites `vsetivli` into `vsetvli` (If so, that 
sounds very iffy because `vsetivli zero` has no scratch X register to work 
with).

FWIW CanMV-K230 boards are on sale for under 500 RMB.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-30 Thread Rémi Denis-Courmont



Le 30 novembre 2023 23:13:59 GMT+02:00, "Martin Storsjö"  a 
écrit :
>On Thu, 30 Nov 2023, Rémi Denis-Courmont wrote:
>
>> You can already test it properly as things stand, and reporting is trivial, 
>> just not to the FATE website. The question is whether this is worth adding 
>> to FATE.
>
>More public test coverage is better than less, isn't it?

That's a false dichotomy.

>> In other words, is publishing on the FATE website worth making the tests 
>> coverage and/or the build time worse?
>
>By making the test coverage worse, you mean if I'd be doing the full testing 
>of many combinations already, and I'd stop doing that in order to do this 
>lesser testing instead? If I'd be doing it (I currently don't) I guess that 
>would be my concern, not others?

No. The point is that this is adding a small hack that works for one specific 
case for a short while (testing Armv8 I8MM and DP), but is known not to be 
sufficient anyway (for SVE, PAuth, RVV, etc).

In the end, it's all about not adding inadequate interfaces and 
supporting/publishing bad solutions. It's certainly not as bad as if it were a 
public C API, but that doesn't make it good. Normally "insufficient" interfaces 
don't get merged for a variety of reasons.

>>> Again, for SVE, I'd rather have testing with 1 config (the default, which
>>> is longer vectors than one usually encounters in HW) rather than none at
>>> all. It won't catch every theoretical issue but practically would catch
>>> many things at least.
>> 
>> I find that statement very misleading. This is not a question of testing 1 
>> config vs 0. It's a question of testing 1 configuration vs all of them(*), 
>> and reporting that one vs reporting all of them elsewhere than 
>> FATE.ffmpeg.org. Until/unless somebody does the missing integration.
>
>Currently I test 0 of these configurations. I would like to test 1 such 
>config, and publish those results on the FATE website. I don't currently test 
>any form of "all configs". And if I wanted to make a private setup for testing 
>"all configs", I really don't see how it would be mutually exclusive with the 
>publicly posted test results from the one config?
>
>>> And in order to actually test BTI, one has to link with a sysroot that
>>> also was built with BTI enabled - I currently use a sysroot extracted from
>>> fedora for that. (And my tests for it use -Wl,-z,force-bti.)
>> 
>> I can readily believe how much of a PITA that would be to set up. I can also 
>> believe that glibc won't allow masking the guarded page bit in mmap()/
>> mprotect().
>> 
>> That does not mean you need different builds to test each of the 4 possible 
>> combinations (or 3 if you ignore the case of BTI without PAC, which does not 
>> exist in real hardware). Once you have that build, you can test it with 
>> whichever QEMU CPU settings.
>
>I didn't mean to imply that one would have to do separate builds for all of 
>those. I currently don't do any testing with builds with 
>-mbranch-protection=standard (and with different sysroots), but I was 
>considering adding one such build, with the fedora sysroot - and only test one 
>single configuration with it (with QEMU's defaults of all features enabled).
>
>
>So, to spell out your objection in simpler terms. You are firmly against 
>anybody posting test results on FATE that only include checkasm but not the 
>rest of the tests, because you consider that this can be misleading/confusing 
>to people reading the test results - is that right?
>
>Or would such a setup be acceptable to you, if someone would implement a way 
>of running the tests (either the full set or only a subset such as checkasm) 
>multiple times with different QEMU configurations, with the same build of 
>ffmpeg, within the same FATE run?
>
>// Martin
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-12-01 Thread Rémi Denis-Courmont



Le 1 décembre 2023 09:55:15 GMT+02:00, "Martin Storsjö"  a 
écrit :
>On Fri, 1 Dec 2023, Rémi Denis-Courmont wrote:
>
>> Le 30 novembre 2023 23:13:59 GMT+02:00, "Martin Storsjö"  
>> a écrit :
>>> On Thu, 30 Nov 2023, Rémi Denis-Courmont wrote:
>> 
>>>> In other words, is publishing on the FATE website worth making the tests 
>>>> coverage and/or the build time worse?
>>> 
>>> By making the test coverage worse, you mean if I'd be doing the full 
>>> testing of many combinations already, and I'd stop doing that in order to 
>>> do this lesser testing instead? If I'd be doing it (I currently don't) I 
>>> guess that would be my concern, not others?
>> 
>> No. The point is that this is adding a small hack that works for one 
>> specific case for a short while (testing Armv8 I8MM and DP), but is known 
>> not to be sufficient anyway (for SVE, PAuth, RVV, etc).
>
>I'll reiterate the question from the bottom of the mail, that you didn't 
>respond to.
>
>Would you be ok with a setup, where a FATE instance optionally can run a 
>subset of tests instead of the full suite, but run them multiple times with 
>e.g. different QEMU settings? That would allow repeating checkasm for all the 
>interesting cases - and if one really wanted to spend a lot of CPU time on it, 
>also could run the full FATE suite in all those configurations.

Being able to run tests under a different runner/wrapper, or the same runner 
with different settings, would be a lot more viable indeed, IMO.

>
>// Martin
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-12-01 Thread Rémi Denis-Courmont

Le perjantaina 24. marraskuuta 2023, 0.39.39 EET flow gg a écrit :
> Okay, changed

src/libavcodec/riscv/ac3dsp_init.c: In function ‘ff_ac3dsp_init_riscv’:
src/libavcodec/riscv/ac3dsp_init.c:39:33: warning: assignment to ‘void (*)
(int32_t *, const float *, size_t)’ {aka ‘void (*)(int *, const float *, long 
unsigned int)’} from incompatible pointer type ‘void (*)(int32_t *, const float 
*, unsigned int)’ {aka ‘void (*)(int *, const float *, unsigned int)’} [-
Wincompatible-pointer-types]
   39 | c->float_to_fixed24 = ff_float_to_fixed24_rvv;
  | ^

Also the Makefile precondition is inaccurate.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-12-01 Thread Rémi Denis-Courmont

Le perjantaina 1. joulukuuta 2023, 20.35.10 EET Rémi Denis-Courmont a écrit :
> Le perjantaina 24. marraskuuta 2023, 0.39.39 EET flow gg a écrit :
> > Okay, changed
> 
> src/libavcodec/riscv/ac3dsp_init.c: In function ‘ff_ac3dsp_init_riscv’:
> src/libavcodec/riscv/ac3dsp_init.c:39:33: warning: assignment to ‘void (*)
> (int32_t *, const float *, size_t)’ {aka ‘void (*)(int *, const float *,
> long unsigned int)’} from incompatible pointer type ‘void (*)(int32_t *,
> const float *, unsigned int)’ {aka ‘void (*)(int *, const float *, unsigned
> int)’} [- Wincompatible-pointer-types]
>39 | c->float_to_fixed24 = ff_float_to_fixed24_rvv;
> 
>   | ^
> 
> Also the Makefile precondition is inaccurate.

Oh, and on C908, LMUL=8 is actually faster than LMUL=4. Generally speaking, 
you should maximise the LMUL unless there is a *specific* reason not to.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] RISC-V dev kit recommendations

2023-12-01 Thread Rémi Denis-Courmont

Le perjantaina 1. joulukuuta 2023, 21.44.24 EET Sean McGovern a écrit :
> If I wanted to purchase a RISC-V developer kit, does anyone have
> suggestions of what to buy? Or even what to steer clear of?

As this is FFmpeg-devel, I don't suppose you are looking for a 
microcontroller. To run Linux, the best option at the moment is the StarFive 
VisionFive 2. However it lacks all the new fancy extensions such as vectors, 
virtualisation and/or cryptography. In fact, I am not aware of commercial 
hardware with either of the latter two.

The otherwise modest Canaan CanMV-K230 constitutes the only commercial device 
currently with vectors, including half-precision, at least to my knowledge. 
However it is currently on back-order.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm/ac3dsp: add float_to_fixed24 test

2023-11-23 Thread Rémi Denis-Courmont

Le torstaina 23. marraskuuta 2023, 9.08.16 EET flow gg a écrit :
> 

You should probably add the test case to tests/fate/checkasm.mak

-- 
レミ・デニ-クールモン
http://www.remlab.net/
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-23 Thread Rémi Denis-Courmont

Le torstaina 23. marraskuuta 2023, 1.17.03 EET flow gg a écrit :
> Hello, I saw the new commit "avcodec/ac3dsp: make len a size_t in
> float_to_fixed24."
> 
> So I removed the part #if (__riscv_xlen == 64) and restored the patch.

You're not checking for Zba. Also 'bnez' would be more logical than 'bgtz' 
for an unsigned counter.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] avcodec/ac3dsp: make len a size_t in float_to_fixed24

2023-11-22 Thread Rémi Denis-Courmont

Le keskiviikkona 22. marraskuuta 2023, 21.49.13 EET James Almer a écrit :
> Should simplify asm implementations, and prevent UB on at least win64.
> 
> Signed-off-by: James Almer 

This one looks good to me, but I am utterly incompetent for the previous two.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-03 Thread Rémi Denis-Courmont

Le sunnuntaina 3. joulukuuta 2023, 16.40.08 EET flow gg a écrit :
> c910
> vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
> vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
> vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
> vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
> vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
> vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
> vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
> vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5

The code below uses fractional multipliers, so I infer that the benchmarked 
code was significantly different, and the measurements are not really worth the 
bother.

I know that supply is a problem at the moment, but if you are going to keep 
this up, I would hope that ISCAS can get you access to an RVV 1.0 board.

In-line...

> diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> new file mode 100644
> index 00..88e0434f0e
> --- /dev/null
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -0,0 +1,47 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include 
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vc1.h"
> +
> +void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +
> +av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> +{
> +#if HAVE_RVV
> +int flags = av_get_cpu_flags();
> +
> +if (flags & AV_CPU_FLAG_RVV_I64) {
> +dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> +dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +}
> +if (flags & AV_CPU_FLAG_RVV_I32) {
> +dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> +dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +}

Probably missing VLENB checks.

> +#endif
> +}
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> new file mode 100644
> index 00..8a6b27192a
> --- /dev/null
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -0,0 +1,123 @@
> +/*
> + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +func ff_vc1_inv_trans_8x8_dc_rvv, zve64x
> +lht2, (a2)
> +lit1, 3
> +mul   t2, t2, t1

You can multiply by 3, 5 or 9 with shift-and-add. By 12 with shift-and-add 
then shift, and by 17 with shift then add. You don't need multiplications.

> +addi  t2, t2, 1
> +srai  t2, t2, 1
> +mul   t2, t2, t1
> +addi  t2, t2, 16
> +srai  t2, t2, 5
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v0, (a0), a1
> +lit0, 8*8
> +vsetvli   zero, t0, e16, m8, ta, ma
> +vmv.v.x   v8, t2

Do you really need to splat? Can't .vx or .wx be used instead?

> +vsetvli   zero, t0, e8, m4, ta, ma
> +vwaddu.wv v8, v8, v0
> +vsetvli   zero, t0, e16, m8, ta, ma
> +vmax.vx

Re: [FFmpeg-devel] [PATCH] lavc/ac3: add R-V Zbb extract_exponents

2023-12-03 Thread Rémi Denis-Courmont



Le 3 décembre 2023 19:50:18 GMT+02:00, Zhao Zhili  a 
écrit :
>
>
>> On Oct 3, 2023, at 00:47, Rémi Denis-Courmont  wrote:
>> 
>> 
>> diff --git a/libavcodec/riscv/ac3dsp_rvb.S b/libavcodec/riscv/ac3dsp_rvb.S
>> new file mode 100644
>> index 00..48f8bb101e
>> --- /dev/null
>> +++ b/libavcodec/riscv/ac3dsp_rvb.S
>> 
>> +func ff_extract_exponents_rvb, zbb
>> +1:
>> +lw   t0, (a1)
>> +addi a0, a0, 1
>> +neg  t1, t0
>> +addi a1, a1, 4
>> +max  t0, t0, t1
>> +addi a2, a2, -1
>> +clz  t0, t0
>> +addi t0, t0, 24 - __riscv_xlen
>> +sb   t0, -1(a0)
>> +bgtza2, 1b
>> +
>> +ret
>> +endfunc
>> —
>
>Got build failure with clang 14:
>
>:6:21: warning: unknown option, expected 'push', 'pop', 'rvc', 
>'norvc', 'relax' or 'norelax'
>.option arch, +zbb
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:24:1: note: while in macro instantiation
>func ff_extract_exponents_rvb, zbb
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:30:9: error: instruction requires the 
>following: 'Zbb' (Basic Bit-Manipulation)
>max t0, t0, t1
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:32:9: error: instruction requires the 
>following: 'Zbb' (Basic Bit-Manipulation)
>clz t0, t0
>^
>make: *** [/home/quink/work/ffmpeg/ffbuild/common.mak:93: 
>libavcodec/riscv/ac3dsp_rvb.o] Error 1
>make: *** Waiting for unfinished jobs
>:6:21: warning: unknown option, expected 'push', 'pop', 'rvc', 
>'norvc', 'relax' or 'norelax'
>.option arch, +f
>^
>src/libavcodec/riscv/audiodsp_rvf.S:23:1: note: while in macro instantiation
>func ff_vector_clipf_rvf, f
>^
>
>Someone says clang 14 has Zbb extensions support. I don’t know what’s going on.

It's not practical to support an assembler as broken as LLVM's built-in one until 
they get their act together. You can add tests in FFmpeg configure but that's 
just going to turn all optimisations off. You could also disable the integrated 
assembler and use binutils, but then you'll hit the limitation of FFmpeg's 
configure whereby it tests the inline assembler rather than the outline one.

So really you're better off with GCC. RISC-V support on LLVM is pretty sad, TBH.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/ac3: add R-V Zbb extract_exponents

2023-12-03 Thread Rémi Denis-Courmont



Le 3 décembre 2023 19:50:18 GMT+02:00, Zhao Zhili  a 
écrit :
>
>
>> On Oct 3, 2023, at 00:47, Rémi Denis-Courmont  wrote:
>> 
>> 
>> diff --git a/libavcodec/riscv/ac3dsp_rvb.S b/libavcodec/riscv/ac3dsp_rvb.S
>> new file mode 100644
>> index 00..48f8bb101e
>> --- /dev/null
>> +++ b/libavcodec/riscv/ac3dsp_rvb.S
>> 
>> +func ff_extract_exponents_rvb, zbb
>> +1:
>> +lw   t0, (a1)
>> +addi a0, a0, 1
>> +neg  t1, t0
>> +addi a1, a1, 4
>> +max  t0, t0, t1
>> +addi a2, a2, -1
>> +clz  t0, t0
>> +addi t0, t0, 24 - __riscv_xlen
>> +sb   t0, -1(a0)
>> +bgtza2, 1b
>> +
>> +ret
>> +endfunc
>> —
>
>Got build failure with clang 14:
>
>:6:21: warning: unknown option, expected 'push', 'pop', 'rvc', 
>'norvc', 'relax' or 'norelax'
>.option arch, +zbb
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:24:1: note: while in macro instantiation
>func ff_extract_exponents_rvb, zbb
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:30:9: error: instruction requires the 
>following: 'Zbb' (Basic Bit-Manipulation)
>max t0, t0, t1
>^
>src/libavcodec/riscv/ac3dsp_rvb.S:32:9: error: instruction requires the 
>following: 'Zbb' (Basic Bit-Manipulation)
>clz t0, t0
>^
>make: *** [/home/quink/work/ffmpeg/ffbuild/common.mak:93: 
>libavcodec/riscv/ac3dsp_rvb.o] Error 1
>make: *** Waiting for unfinished jobs
>:6:21: warning: unknown option, expected 'push', 'pop', 'rvc', 
>'norvc', 'relax' or 'norelax'
>.option arch, +f
>^
>src/libavcodec/riscv/audiodsp_rvf.S:23:1: note: while in macro instantiation
>func ff_vector_clipf_rvf, f
>^
>
>Someone says clang 14 has Zbb extensions support. I don’t know what’s going on.

Forgot the explanation: the problem seems to be the arch option, not the Zbb 
extension.

I think you can get around it with -no-integrated-as, but then... what I said 
in the other mail.

Br,
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 7/7] avcodec: add AV_CODEC_FLAG_CLEAR

2023-12-07 Thread Rémi Denis-Courmont

Hi,

Le 8 décembre 2023 00:47:13 GMT+02:00, Marton Balint  a écrit :
>
>
>On Thu, 7 Dec 2023, Anton Khirnov wrote:
>
>> Quoting Ronald S. Bultje (2023-12-07 02:44:36)
>>> Hi,
>>> 
>>> On Wed, Dec 6, 2023 at 3:23 AM Marton Balint  wrote:
>>> 
>>> > Signed-off-by: Marton Balint 
>>> > ---
>>> >  doc/APIchanges |  3 +++
>>> >  doc/codecs.texi| 14 ++
>>> >  libavcodec/avcodec.h   |  4 
>>> >  libavcodec/decode.c|  6 ++
>>> >  libavcodec/options_table.h |  1 +
>>> >  libavcodec/version.h   |  2 +-
>>> >  6 files changed, 29 insertions(+), 1 deletion(-)
>>> >
>>> > diff --git a/doc/APIchanges b/doc/APIchanges
>>> > index 416e2bec5e..f839504a64 100644
>>> > --- a/doc/APIchanges
>>> > +++ b/doc/APIchanges
>>> > @@ -2,6 +2,9 @@ The last version increases of all libraries were on
>>> > 2023-02-09
>>> >
>>> >  API changes, most recent first:
>>> >
>>> > +2023-12-xx - xxx - lavc 60.36.100 - avcodec.h
>>> > +  Add AV_CODEC_FLAG_CLEAR.
>>> > +
>>> >  2023-12-xx - xxx - lavu 58.33.100 - imgutils.h
>>> >Add av_image_fill_color()
>>> >
>>> > diff --git a/doc/codecs.texi b/doc/codecs.texi
>>> > index 5b950b4560..0504a535f2 100644
>>> > --- a/doc/codecs.texi
>>> > +++ b/doc/codecs.texi
>>> > @@ -76,6 +76,20 @@ Apply interlaced motion estimation.
>>> >  Use closed gop.
>>> >  @item output_corrupt
>>> >  Output even potentially corrupted frames.
>>> > +@item clear
>>> > +Clear the contents of the video buffer before decoding the next picture
>>> > to it.
>>> > +
>>> > +Usually if only a part of a picture is affected by a decode error then 
>>> > the
>>> > +decoder (if it implements error concealment) tries to hide it by
>>> > interpolating
>>> > +pixels from neighbouring areas or in some cases from the previous frame.
>>> > Even
>>> > +without error concealment it is quite likely that the affected area will
>>> > +contain pixels from an earlier frame, due to frame pooling.
>>> >
>>> 
>>> No comment on the patch itself, but wouldn't our users (and the C standard
>>> itself) consider it a security issue to return stale
>> 
>> I don't see the security issue in returning previously-returned frame
>> data.
>
>I guess what Ronald means that it is possible that the decoder frame pool 
>allocates data in heap previously containing sensitive data, and that might 
>never get overwritten in case of faulty input before passing it to the user.
>
>The simple fix for that is to clear frame pool buffers on creation?
>
>I am not sure if it is actually UB to read uninitialzied data from the heap 
>though.

Reading uninitialised data is UB if the type representation is not surjective 
(e.g. bool, and potentially compound types with padding). Of course there are 
all sorts of other problems that could indirectly cause UB such as implicitly 
assuming that an integer fits a certain range and triggering an undefined 
overflow otherwise.

>
>Regards,
>Marton
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/lpc: R-V V apply_welch_window

2023-12-08 Thread Rémi Denis-Courmont

apply_welch_window_even_c:   617.5
apply_welch_window_even_rvv_f64: 235.0
apply_welch_window_odd_c:709.0
apply_welch_window_odd_rvv_f64:  256.5
---
 libavcodec/lpc.c|  4 +-
 libavcodec/lpc.h|  1 +
 libavcodec/riscv/Makefile   |  2 +
 libavcodec/riscv/lpc_init.c | 37 
 libavcodec/riscv/lpc_rvv.S  | 88 +
 5 files changed, 131 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/lpc_init.c
 create mode 100644 libavcodec/riscv/lpc_rvv.S

diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index dc6a3060ce..9e2fd0f128 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -320,7 +320,9 @@ av_cold int ff_lpc_init(LPCContext *s, int blocksize, int 
max_order,
 s->lpc_apply_welch_window = lpc_apply_welch_window_c;
 s->lpc_compute_autocorr   = lpc_compute_autocorr_c;
 
-#if ARCH_X86
+#if ARCH_RISCV
+ff_lpc_init_riscv(s);
+#elif ARCH_X86
 ff_lpc_init_x86(s);
 #endif
 
diff --git a/libavcodec/lpc.h b/libavcodec/lpc.h
index 467d0b2830..0200baea5c 100644
--- a/libavcodec/lpc.h
+++ b/libavcodec/lpc.h
@@ -109,6 +109,7 @@ double ff_lpc_calc_ref_coefs_f(LPCContext *s, const float 
*samples, int len,
  */
 int ff_lpc_init(LPCContext *s, int blocksize, int max_order,
 enum FFLPCType lpc_type);
+void ff_lpc_init_riscv(LPCContext *s);
 void ff_lpc_init_x86(LPCContext *s);
 
 /**
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index e9825c0856..1d4572fbc5 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -33,6 +33,8 @@ OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_init.o
 RVV-OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_rvv.o
 OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_init.o
 RVV-OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_rvv.o
+OBJS-$(CONFIG_LPC) += riscv/lpc_init.o
+RVV-OBJS-$(CONFIG_LPC) += riscv/lpc_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
 RVV-OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_rvv.o
 OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o
diff --git a/libavcodec/riscv/lpc_init.c b/libavcodec/riscv/lpc_init.c
new file mode 100644
index 00..c16e5745f0
--- /dev/null
+++ b/libavcodec/riscv/lpc_init.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/lpc.h"
+
+void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
+
+av_cold void ff_lpc_init_riscv(LPCContext *c)
+{
+#if HAVE_RVV && (__riscv_xlen >= 64)
+int flags = av_get_cpu_flags();
+
+if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
+c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
+#endif
+}
diff --git a/libavcodec/riscv/lpc_rvv.S b/libavcodec/riscv/lpc_rvv.S
new file mode 100644
index 00..2bc729d400
--- /dev/null
+++ b/libavcodec/riscv/lpc_rvv.S
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+#if __riscv_xlen >= 64
+func ff_lpc_apply_welch_window_rvv, zve64d
+vsetvli t0, zero, e64, m8, ta, ma
+vid.v   v0
+addit2, a1, -1
+vfcvt.f.xu.v v0, v0
+li  t3, 2
+fcvt.d.l ft2, t2
+srait1, a1, 1
+fcvt.d.l ft3, t3
+li  t4,

Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-27 Thread Rémi Denis-Courmont



Le 26 novembre 2023 22:54:28 GMT+02:00, flow gg  a écrit :
>This is a bit confusing for me.. I tried pulling the latest code, and then
>used `git am checkasm-test-for-dcmul_add.patch` without any patch
>corruption.

Did you try with the actual sent email or only with the original patch file? 
___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-27 Thread Rémi Denis-Courmont

Le maanantaina 27. marraskuuta 2023, 14.31.18 EET Martin Storsjö a écrit :
> This can be useful if doing testing of uncommon CPU extensions by
> running tests with QEMU (by configuring with e.g.
> "target_exec=qemu-aarch64"), by only running the checkasm tests,
> to get a reasonable test coverage without excessive test runtime.

For the purpose of testing future or bleeding-edge CPU extensions on emulator, 
you would normally want to be able to actually filter those in. That is more of 
a matter of patching checkasm than FATE.

Considering the poor coverage of checkasm, I fear that this just gives the 
wrong impression, not to say a false sense of security. It feels misleading to 
encourage or support that paradigm into FATE, in light of that poor coverage. 
After all, if it's just about running checkasm, anybody can just run
`make tests/checkasm/checkasm && tests/checkasm/checkasm`.

Either way, this feels like a case of cart before horse.

Also FWIW, RV broke due to misaligned accesses and illegal vector types that 
QEMU tolerated. That is rather an argument against QEMU than against this MR 
but still.

-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-27 Thread Rémi Denis-Courmont

Le 28 novembre 2023 01:22:14 GMT+02:00, Michael Niedermayer 
 a écrit :
>On Mon, Nov 27, 2023 at 05:46:40PM +0200, Rémi Denis-Courmont wrote:
>[...]
>> Also FWIW, RV broke due to misaligned accesses and illegal vector types that
>> QEMU tolerated. That is rather an argument against QEMU than against this MR 
>> but still.
>
>has someone reported this to qemu ?
>(seems like a bug)

It's not a bug. The specification leaves those cases *undefined*. QEMU supports 
them because it can, and adding sanity checks would just slow stuff down.

Also, generally, QEMU TCG policy seems to be to maximize perf and compatibility, 
not formal correctness.

>
>thx
>
>[...]
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] MAINTAINERS: remove myself from FFmpeg

2023-12-07 Thread Rémi Denis-Courmont

Le torstaina 7. joulukuuta 2023, 10.59.06 EET Nicolas George a écrit :
> Jean-Baptiste Kempf (12023-12-07):
> > Why?
> 
> Because after twelve years libav has finally managed to take control and
> FFmpeg is now essentially dead.

The question was for Paul. Even if you take Anton's knee-jerk threats of 
reverts as LibAV-think, they were but the last straw (Paul wrote as much).

You have had heated arguments against Paul in recent times too. You have also 
argued a lot about exercising your review privileges, which sounds like a very 
libavish notion to me - a LibAV notion that made it into the written-down FFmpeg 
project rules. As a matter of fact, regardless of who was right or wrong, and 
whence, I can only _observe_ that Paul did complain specifically about you on 
the IRC channel. To be fair, he also abundantly abused JB there, although I do 
not know how much of it was sarcasm vs actual attack.

But in any case, by that same logic, you could also be "thank[ed] for your 
contribution in this". In my opinion, this would be unfair to you, and 
accordingly, you are being unfair to whomever you designate by "libav".

(For the sake of utmost clarity, I am not so vain as to consider myself a part 
of the former project known as libav with however few contributions I made 
thereto.)

> Thank you for your contribution in this.

@CC: Yes, that second sentence can be construed as an ad hominem against 
Anton. Feel free to ban me.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-30 Thread Rémi Denis-Courmont



Le 27 novembre 2023 23:55:18 GMT+02:00, "Martin Storsjö"  a 
écrit :
>On Mon, 27 Nov 2023, Rémi Denis-Courmont wrote:
>
>> Le maanantaina 27. marraskuuta 2023, 14.31.18 EET Martin Storsjö a écrit :
>>> This can be useful if doing testing of uncommon CPU extensions by
>>> running tests with QEMU (by configuring with e.g.
>>> "target_exec=qemu-aarch64"), by only running the checkasm tests,
>>> to get a reasonable test coverage without excessive test runtime.
>> 
>> For the purpose of testing future or bleeding-edge CPU extensions on 
>> emulator, you would normally want to be able to actually filter those in. 
>> That is more of a matter of patching checkasm than FATE.
>
>Sorry, can you elaborate on what you mean with "filter those in" here?

You're running all checkasm tests, not just those that require the emulator.

But what's potentially much worse is that you're triggering a whole build, or 
it's not entirely clear from the description how you'd reuse an existing build.

For Armv8, that's just bad. For RV, that's terrible, as we need to run the same 
checkasm with different emulator configuration (different $QEMU_CPU in the case 
of QEMU): one per vector length. Armv9 will potentially have the same problem 
if FFmpeg grows SVE(2) support.

>
>> Considering the poor coverage of checkasm, I fear that this just gives the 
>> wrong impression, not to say a false sense of security. It feels misleading 
>> to encourage or support that paradigm into FATE, in light of that poor 
>> coverage. Afterall, if it's just about running checkasm, anybody can just 
>> run `make tests/checkasm/checkasm && tests/checkasm/checkasm`.
>
>Yes, anybody can run that - but having those results posted continuously 
>somewhere where other can see them can be valuable as well.
>
>Anyway, the concrete case I'm considering, is that we've got AArch64 code 
>merged, that uses the I8MM extensions. We don't have any FATE configuration 
>that continuously test that. Whenever there are patches, I do spin up a cloud 
>instance that supports this extension and test the patches there, but 
>inbetween that we're pretty much blind.
>
>While checkasm's coverage isn't fantastic, for this particular case I'm not 
>merging any AArch64 code for new extensions unless that code is covered by 
>checkasm.
>
>The other AArch64 feature that we do have code for, which also is untested, is 
>the assembly support for branch protection and pointer authentication. Also 
>this is testable pretty easily with QEMU. It's of course more interesting to 
>run the full fate suite, but if we're not looking for bugs in the compiler but 
>only for bugs in our assembly, then checkasm should cover most of it.
>
>Yes there's potential for QEMU bugs hiding real issues, but I'd rather have a 
>regular run of QEMU+checkasm than not have it tested at all. And I'm 
>volunteering HW+time for testing these cases with QEMU for whatever checkasm 
>covers, but I'm not volunteering it for full fate runs with QEMU.
>
>And sure, I can just run such configs privately, as I already do run a bunch 
>of various regular builds for projects I care about - but as we do have FATE 
>with the public status page, hooking it up to be reported there would feel 
>like added value for everybody.
>
>// Martin
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-30 Thread Rémi Denis-Courmont

Le torstaina 30. marraskuuta 2023, 17.34.31 EET Martin Storsjö a écrit :
> Yeah, I wouldn't reuse an existing build here. For the setup I have in
> mind, one build doesn't take too horribly long (either on an old desktop
> x86 machine, or a moderate aarch64 server) - so it's not ideal but not a
> dealbreaker anyway (while running all of fate with qemu takes one
> magnitude longer).

Well it's pretty much a deal breaker for Armv9 and RV. I can understand 
wanting to build on a comfy x86 server, but doing different builds just to 
change QEMU CPU flags is IMO inept.

Sure, we could just build once and run several times checkasm with a separate 
script, as I already pointed out. But then this patch is completely 
unnecessary.

> For the other setup I intended to test, to test AArch64 PAC and BTI, I
> would do a separate build with -mbranch-protection=standard anyway.

That does not make much sense to me. PAC and BTI should be enabled by default 
in compatibility mode (for ARMv8.0-8.2 builds) or noncompatibility mode (for 
ARMv8.3+ builds). The resulting code should be tested with and without PAC and 
with and without BTI.

Separate builds only might make sense if you want to do something more fancy 
with PAC, requiring the non-HINT instructions, but then that is beyond 
"standard" branch protection. For BTI, there are no reasons whatsoever to make 
separate builds; it's a literal waste of time and energy.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-11-30 Thread Rémi Denis-Courmont

Le tiistaina 28. marraskuuta 2023, 18.59.38 EET flow gg a écrit :
> 

Since nobody else commented, I shall note that you should probably split the 
underlying lavc changes into a separate preliminary patch.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-30 Thread Rémi Denis-Courmont

Le tiistaina 28. marraskuuta 2023, 16.21.55 EET Michael Niedermayer a écrit :
> On Tue, Nov 28, 2023 at 09:27:08AM +0200, Rémi Denis-Courmont wrote:
> > Le 28 novembre 2023 01:22:14 GMT+02:00, Michael Niedermayer 
 a écrit :
> > >On Mon, Nov 27, 2023 at 05:46:40PM +0200, Rémi Denis-Courmont wrote:
> > >[...]
> > >
> > >> Also FWIW, RV broke due to misaligned accesses and illegal vector types
> > >> that QEMU tolerated. That is rather an argument against QEMU than
> > >> against this MR but still.
> > >
> > >has someone reported this to qemu ?
> > >(seems like a bug)
> > 
> > It's not a bug. The specification leaves those cases *undefined*. QEMU
> > supports them because they can, and adding sanity checks would just slow
> > stuff down.
> > 
> > Also generally QEMU TCG policy seems to be maximize perf and
> > compatibility, not formal correctness.
> I think i read somewhere that recent qemu supposedly checks alignment on arm
> more completely. But i couldnt quickly find a official statement about that

As of 8.2.0-rc2, it most definitely does not:

8<
static inline void gen_check_sp_alignment(DisasContext *s)
{
/* The AArch64 architecture mandates that (if enabled via PSTATE
 * or SCTLR bits) there is a check that SP is 16-aligned on every
 * SP-relative load or store (with an exception generated if it is not).
 * In line with general QEMU practice regarding misaligned accesses,
 * we omit these checks for the sake of guest program performance.
 * This function is provided as a hook so we can more easily add these
 * checks in future (possibly as a "favour catching guest program bugs
 * over speed" user selectable option).
 */ 
}  
>8

And this is an actual violation of the specification. In the RISC-V case, QEMU 
is not even violating the specification, just making a different choice than 
the only hardware implementation that is currently commercially available.

> But either way, qemu could emit such code optionally when it is used for
> testing. Which is one of the things people use qemu for.

That would be very true for system mode "soft-MMU" QEMU, but much more 
questionable for user mode. In any case, I don't make their policies.

> So IMHO it would make sense for qemu to detect cases that are undefined
> even if for no other reason than to emulate the hw more exactly.

I would agree that optional flags would be sensible. But TBH, we don't even yet 
know how the IPs from other vendors than Alibaba/T-Head will behave.

> If this is not done, qemu can be detected and code could refuse or
> fail to run

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] fate.sh: Allow overriding what targets to make for running the tests

2023-11-30 Thread Rémi Denis-Courmont

Le torstaina 30. marraskuuta 2023, 18.28.39 EET Martin Storsjö a écrit :
> On Thu, 30 Nov 2023, Rémi Denis-Courmont wrote:
> > Le torstaina 30. marraskuuta 2023, 17.34.31 EET Martin Storsjö a écrit :
> >> Yeah, I wouldn't reuse an existing build here. For the setup I have in
> >> mind, one build doesn't take too horribly long (either on an old desktop
> >> x86 machine, or a moderate aarch64 server) - so it's not ideal but not a
> >> dealbreaker anyway (while running all of fate with qemu takes one
> >> magnitude longer).
> > 
> > Well it's pretty much a deal breaker for Armv9 and RV. I can understand
> > wanting to build on a comfy x86 server, but doing different builds just to
> > change QEMU CPU flags is IMO inept.
> 
> Yes. But for doing one single run with QEMU, I don't mind.

You can already test it properly as things stand, and reporting is trivial, 
just not to the FATE website. The question is whether this is worth adding to 
FATE. In other words, is publishing on the FATE website worth making the test 
coverage and/or the build time worse? Not to mention confusing the existing 
website users with weirdly incomplete test results.

> Again, for SVE, I'd rather have testing with 1 config (the default, which
> is longer vectors than one usually encounters in HW) rather than none at
> all. It won't catch every theoretical issue but practically would catch
> many things at least.

I find that statement very misleading. This is not a question of testing 1 
config vs 0. It's a question of testing 1 configuration vs all of them(*), and 
reporting that one vs reporting all of them elsewhere than FATE.ffmpeg.org. 
Until/unless somebody does the missing integration.

(*) at least those that QEMU supports

> Are you volunteering to write FATE integration to run checkasm multiple 
> times with different QEMU settings, so I can wait for that instead of 
> having much improved public test coverage right now?

Of course I will not volunteer, given that the RISE project already has an 
outstanding RfP which will likely require this done professionally:
https://hubs.la/Q029hwpS0
(That does not mean that I would have volunteered otherwise, just that the 
question is moot as far as I am concerned and for the time being.)

> > Sure, we could just build once and run several times checkasm with a
> > separate script, as I already pointed out. But then this patch is
> > completely unnecessary.
> 
> Indeed, that's trivial to do for a private testing setup.
> 
> >> For the other setup I intended to test, to test AArch64 PAC and BTI, I
> >> would do a separate build with -mbranch-protection=standard anyway.
> > 
> > That does not make much sense to me. PAC and BTI should be enabled by
> > default in compatibility mode (for ARMv8.0-8.2 builds) or
> > noncompatibility mode (for ARMv8.3+ builds).
> 
> Maybe it should - but it currently isn't.

That's really up to whoever set up the AArch64 builds to fix their build flags 
TBH (I believe that the assembler is already sorted). And at least for PAuth, 
that should be sufficient, as support from the C runtime is not required.

> And in order to actually test BTI, one has to link with a sysroot that
> also was built with BTI enabled - I currently use a sysroot extracted from
> fedora for that. (And my tests for it use -Wl,-z,force-bti.)

I can readily believe how much of a PITA that would be to set up. I can also 
believe that glibc won't allow masking the guarded page bit in mmap()/
mprotect().

That does not mean you need different builds to test each of the 4 possible 
combinations (or 3 if you ignore the case of BTI without PAC, which does not 
exist in real hardware). Once you have that build, you can test it with 
whichever QEMU CPU settings. Surely Fedora, of all distros, is not going to 
treat Armv8.5-BTI as a distinct arch from AArch64 whilst Arm made sure it was 
both backward-compatible and runtime-tunable.

-- 
レミ・デニ-クールモン
http://www.remlab.net/

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-04 Thread Rémi Denis-Courmont

Le maanantaina 4. joulukuuta 2023, 10.48.56 EET flow gg a écrit :
> > Probably missing VLENB checks.
> 
> Changed.
> 
> > You can multiply by 3, 5 or 9 with shift-and-add. By 12 with shift-and-add
> > then shift, and by 17 with shift then add. You don't need multiplications.
> 
> Changed.
> 
> > Do you really need to splat? Can't .vx or .wx be used instead?
> 
> Okay, for example in ff_vc1_inv_trans_8x8_dc_rvv
> 
> + vsetvli  zero, t0, e8, m2, ta, ma
> + vwaddu.vxv4, v0, zero
> + vsetvli  zero, t0, e16, m4, ta, ma
> + vadd.vx  v4, v4, t2
> - vsetvli  zero, t0, e16, m4, ta, ma
> - vmv.v.x  v4, t2
> - vsetvli  zero, t0, e8, m2, ta, ma
> - vwaddu.wvv4, v4, v0
> 
> But the speed has slowed down slightly on the c910,
> I'm not sure if I should modify it.

OK, unfortunately, there is no widening addition with wide scalar operand. But 
you can do zero-extension then addition here. In the end, I doubt that you can 
reasonably optimise whilst working with a C910-based board. This function 
deviates too much on non-conformant hardware.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-28 Thread Rémi Denis-Courmont

Le sunnuntaina 28. tammikuuta 2024, 5.25.49 EET Michael Niedermayer a écrit :
> Please read the following to get a better understanding what STF is about:
> (In short it is about maintenance and sustainability, not features)
> https://www.sovereigntechfund.de/programs/applications
> 
> As some probably already know, Thilo has worked with STF to work out
> many details of this. SPI will handle the financials for FFmpeg.

As anybody who's been following FFmpeg-devel knows, people have pointed out 
SPI seems like a poor choice of vehicle for that sort of commission. I won't 
repeat the arguments that were already made in the second half of last year.

But I will add a few comments...

> Everyone willing to benefit from this sponsorship must not be a US sanctioned
> entity or in a US sanctioned country. 

In other words, the choice of a US vehicle is excluding people who are, or 
fear that they may be affected by US sanctions. Some active developers are 
associated with, for example, the Chinese Academy of Science, Huawei 
Technologies or other Chinese IT R&D entities. This is discriminatory, and thus 
something that an open-source project should actively seek to *avoid*.

German government funding should go to German or at least EU-based entities if 
only for that reason. In other words, by going through SPI, Thilo is 
*unnecessarily* bringing ugly politics into an open-source project. (And 
please don't shoot the messenger here.)

> At this point, what we need is a list of Projects so we can submit an
> application to STF at or before 12th Feb. (at the 14th they have a meeting
> and will review our submission) What STF told us, they need ATM is:

The "selection criteria" seem rather restrictive. It seems that critical tasks 
such as long-term maintenance (Anton) and security fixes (you) are in scope. 
Though I can only agree with Kieran that SoW is ill-suited for tasks of the 
sort. If SPI insists on SoW, which is somewhat understandable from their legal 
and moral standpoint, then that is another reason why SPI should not, or 
maybe, cannot, be the vehicle.

By stretching the criteria a little, maybe reasonably expected external or 
normative updates are also in scope, like say implementing optimisations for 
new ISA extensions or new codec profiles. But implementing entirely new 
features seems unambiguously excluded, especially if competing with existing 
open-source projects. Prototypes are also *explicitly* excluded. So for the 
sake of the argument, reimplementing X264, dav1d or GNU/radio functionality in 
FFmpeg seems like it would not qualify.

I am not a lawyer, but there may be nontrivial legal implications for SPI and 
the contractees here. Note that I do not mean to argue against the 
restrictions here. They make perfect sense considering that this funding would 
ultimately come from the German tax payers.

(...)

> My suggestion is that we create a Trac WIKI page similar to the ideas
> page for GSoC.
> On that page everyone can add a project idea.
> The requirement is that
> 1. it must fit in the goals and mission and all of STF
> 2. it must be about FFmpeg (IIUC non coding tasks are ok)

IIUC, they are *not* OK, unless they are a dependency of a coding task:

| Development is our primary focus, although security audits, conference
| attendance, and other community-based events can be included in the
| application should they be necessary.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/4] lavc/rv34dsp: R-V V rv34_inv_transform_dc

2024-01-31 Thread Rémi Denis-Courmont

Hi,

I think this breaks the build for RV32, and it lacks checks for the vector 
length.

Also, the fractional multiplier should never be smaller than the ratio of the 
specified element size to the largest element size used in the function. Here 
it is largely inconsequential, but for instance "e32, mf4" and "e64, mf2" are 
invalid.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Subject: [PATCH 3/3] lavc/dnxhdenc: R-V V get_pixels_8x4_sym

2024-01-29 Thread Rémi Denis-Courmont

Hi,

+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_get_pixels_8x4_sym_rvv, zve64x
+vsetivlizero, 8, e8, mf2, ta, ma
+vlse64.vv16, (a1), a2
+li  t0, 8 * 8
+vsetvli zero, t0, e16, m4, ta, ma
+vzext.vf2   v8, v16
+vse16.v v8, (a0)
+li  a2, 8*2

That's not needed. You can use immediate values.

+vsetivlizero, 2, e8, mf8, ta, ma
+addia1, a0, 48
+addia0, a0, 32*2
+vle64.v v0, (a1)
+vse64.v v0, (a0)
+sub a1, a1, a2
+vle64.v v0, (a1)
+add a0, a0, a2
+vse64.v v0, (a0)
+sub a1, a1, a2
+vle64.v v0, (a1)
+add a0, a0, a2
+vse64.v v0, (a0)
+sub a1, a1, a2
+vle64.v v0, (a1)
+add a0, a0, a2
+vse64.v v0, (a0)

You can reorder to avoid immediate data dependencies on the addresses.

I expect that it would be faster to make one large load, and then 4 small 
stores, but that might work only for exactly 128-bit vectors?

In any case, you need to check the vector length in init.

+
+ret
+endfunc

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-29 Thread Rémi Denis-Courmont

Le maanantaina 29. tammikuuta 2024, 19.27.14 EET Michael Niedermayer a écrit :
> Also FFmpeg has been part of Google summer of code for many many years
> and also in the past in outreachy. All these projects payed "students"
> for work they did.
> From a legal point of view, these are probably very similar
> 
> Mysteriously, there was a total absence of similar drama there.
> I wonder how it could have been possible to do that for over a decade
> with not one instance of drama or problems like here.

Google funding GSoC students to work on FFmpeg. And nobody objected against the 
core idea of STF funding developers to work on FFmpeg.

The "drama" is about how and through whom the funding goes. That drama 
couldn't be had for GSoC because how was however Google decides, and there was 
no intermediary to go through (money went straight from Google to the 
students).

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-29 Thread Rémi Denis-Courmont

Le maanantaina 29. tammikuuta 2024, 20.11.19 EET Michael Niedermayer a écrit :
> > The "drama" is about how and through whom the funding goes.
> 
> ok, elaborate please
> 
> All FFmpeg money has always been handled through SPI or associated entities

It was already a bit of a stretch to compare GSoC students with (hypothetical) 
STF subcontractors. So sorry but I simply don't think that the funding for 
mentors is comparable at all. In fact, it seems completely normal for the GSoC 
mentor funding to go via open-source foundations, and other GSoC projects 
presumably operate the same way.

> Its under the control of the community and its transparent

You always have the control of the community at the time of review and merge.

You can argue all you want that more open is better. What I see is that this 
more open is already turning into a train wreck (as predicted last year).

> And very important what do you propose ?

We already went through this in the previous thread last year. This is not 
going to work in the light of what Jonatas politely calls FFmpeg "governance" 
challenges. It was already clear that finding agreement within the GA would be 
at best very difficult and untimely.

People (including myself) already suggested to arrange that sort of things via 
an IT service company (*not* necessarily FFlabs). Or you could even go through 
a "porting" company in your country if you can't find an existing agreeable 
company and don't want to register your own. Of course those are not perfect 
solutions but they seem far less fraught with problems than going through a 
foundation, especially a US-based foundation. You can review the archives for 
details.

And it certainly does not help that this only became public so late in the 
process, which is intrinsically suspicious.

> Should we reject the maybe 200k € grant we could get from STF now ?

Again, nobody objected to getting funding from STF as such.

> > That drama
> > couldn't be had for GSoC because how was however Google decides, and there
> > was no intermediary to go through (money went straight from Google to the
> > students).
> 
> SPI handles all the GSoC mentor money.
> And lets just assume it would handle the students money too, what difference
> would that really make ?

It would cause similar arguments to this one. And that's if Google would even 
agree to such a setup (which I guess they wouldn't).

What is the point of going through SPI for *this* (as opposed to regular 
donations)?

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-02-01 Thread Rémi Denis-Courmont

Le torstaina 1. helmikuuta 2024, 19.59.14 EET Anton Khirnov a écrit :
> > Why should i suddenly do something different ?
> > I did it for 100% free back then
> > and here it wouldnt even make sense, closing false positives also
> > counts as resolved. Its less work even to get 70USD ;)
> 
> What's with this hurt-feelings tone? You ASKED people to comment on the
> proposals, so I asked a question. You can just answer it, no need to get
> all emotional about it. I don't stalk you or your commits, why do you
> expect me to know that you worked on such issues "long ago"? I don't
> even know one can close coverity issues manually.
> 
> What I do know is that I've seen similar initiatives run into this
> pathology in the past, hence my question.

Yeah, well there are two sides to this issue.

The obvious one is that reviewing code takes time and is not exactly the 
most rewarding job. This is especially true for reviewing dull issues like 
Coverity's, but it is generally true.

The less obvious flip side is that somebody should also review the handling 
of Coverity issues, even those that end up marked as "False positive" or 
"Intentional".

This gets even worse if everybody knows that someone else is paid. Then the 
incentive to review on one's free time gets even lower in my experience. I 
don't know how to address that paradox generally speaking, but I do think that 
bug triaging, bug fixing and code review should be paid per hour, not per bug 
report (and I count Coverity issues as a type of bug reports).

This is not just theoretical. I have actually previously worked in an 
organisation that paid contractors per bug as a unit, and of course people 
gamed the system to get paid more with little extra work.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [RFC] Vote STF/SPI 2024-02

2024-02-01 Thread Rémi Denis-Courmont

Le torstaina 1. helmikuuta 2024, 19.45.52 EET Vittorio Giovara a écrit :
> The same of course should apply to any other future funding, it must be
> either the community (via GA) or a third party setting up the sponsorship.

Neither the community nor the GA can forbid people from seeking funding for 
themselves. I suppose that, in theory, developers could be required to sign an 
agreement to that effect before they are allowed to submit code for inclusion, 
but that seems neither practical, nor desirable to me.

That is probably not what you meant, but that is what this reads like.


Frankly, if Thilo secures the funding, it's between him and the German 
authorities what they want to spend it on, as long as it remains within the 
boundaries of applicable laws. If he can come with a project to fund Michael 
to maintain FFmpeg for a while, FFmpeg will be no worse off.

Nobody should claim to represent FFmpeg without any kind of preexisting 
delegation to do so. If that was done, then that is very morally wrong. But 
realistically, we cannot enforce that. Some people did it in the past and will 
continue to do it in the future. It is effectively up to the other parties to 
perform due diligence and not get fooled - if they even care. STF probably 
does not care; NAB most certainly does not care.

Moreover pretenses of this process being open need to be dropped. It's not 
open if any and all objections are summarily rejected to put it politely. A 
short deadline is not an excuse, even if it was unavoidable. (And I remain 
unconvinced that public discussion could not have started earlier than it did.)

Ultimately, whatever comes out of this does not get any special exemption from 
code review standards and TC oversight, but that should be a given. Therefore 
this funding should much preferably be used toward as uncontroversial tasks as 
possible: Maintenance is a good example. SDR is a counter-example.


With that long side note, while I agree with most of what you said otherwise, 
I don't think that there is any merit to excluding Michael from this process, 
doubly so if there are too few viable proposals.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V clear_blocks

2024-02-01 Thread Rémi Denis-Courmont

You should probably use an assembler macro to repeat the code.


-- 
レミ・デニ-クールモン
http://www.remlab.net/
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] checkasm/rv34dsp: add rv34_inv_transform_dc test

2024-02-02 Thread Rémi Denis-Courmont

Le 2 février 2024 01:42:20 GMT+02:00, Michael Niedermayer 
 a écrit :
>On Wed, Jan 31, 2024 at 08:00:18PM +0800, flow gg wrote:
>> 
>
>>  checkasm/Makefile   |1 
>>  checkasm/checkasm.c |3 ++
>>  checkasm/checkasm.h |1 
>>  checkasm/rv34dsp.c  |   65 
>> 
>>  fate/checkasm.mak   |1 
>>  5 files changed, 71 insertions(+)
>> e7eed6e25de9f313ddb3c0f3066f02f0671d3271  
>> 0001-checkasm-rv34dsp-add-rv34_inv_transform_dc-test.patch
>> From 46a81051f49f6b4032815d5f123be8ff614033e2 Mon Sep 17 00:00:00 2001
>> From: sunyuechi 
>> Date: Wed, 31 Jan 2024 19:00:23 +0800
>> Subject: [PATCH 1/4] checkasm/rv34dsp: add rv34_inv_transform_dc test
>
>seems to fail here

Do you mean that the test is wrong or that it exposes a bug in the x86 
optimisations (which wouldn't be the first occurrence)?

It's painful enough that RVV optimisations need to add checkasm tests for 
existing code. We can't be expected to fix x86 bugs on top.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-02-04 Thread Rémi Denis-Courmont

Hi,

Le 4 février 2024 14:41:15 GMT+01:00, Michael Niedermayer 
 a écrit :
>Hi
>
>As said on IRC, i thought people knew it, but ‘the same person as before’ is 
>Thilo.
>
>Ive updated the price design suggestion for the merge task, its 16€ / commit 
>limited to 50k€
>this comes from looking at pauls fork which has around 500 commits in 2 months 
>thus
>250 commits per month, 12 months, and if we allocate 50k that end with roughly 
>16€ / commit
>if activity stays equal.

It's very different if we're talking about librempeg or some other unspecified 
fork. I could make a fork that removes MMX et al, and claim that I'm merging a 
fork.

>The task has ATM no developer on it. If a developer adds himself, he can 
>change teh task
>and specify what he proposes to merge.
>
>I am totally perplexed why every dot on every i is such a big thing.

That is the whole point of a statement of work. And I agree that it's tedious 
and possibly outright annoying...

Indeed I don't think that a semiformal open-source community with a lot of 
strong and varied opinions will carry such dotting of all i's very effectively. 
That has been one of the arguments for delegating this to a contracting IT 
company rather than to FFmpeg-devel and SPI.

>We are doing GSoC for a decade and noone cared about voting about anything in 
>it.

Again, I don't think it's a fair comparison. GSoC rules are a given set by 
Google. Maintenance is not allowed nor are vague broadly defined tasks. Also 
the mentor payment is not really a proper compensation, nor is it intended to 
be.

>The difference here is FFmpeg developers are benefiting from the money.

That's a pretty major difference.

>We send an application and a scope of work.

That's exactly why we need to have a precise scope of work to vote on this.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-02-04 Thread Rémi Denis-Courmont

Hi,

I don't believe it is appropriate to hold the vote before Derek's question is 
addressed.

We don't really know what we're voting on here.



Le 1 février 2024 20:22:14 GMT+01:00, Derek Buitenhuis 
 a écrit :
>On 1/31/2024 9:44 PM, Derek Buitenhuis wrote:
>> On 1/30/2024 1:48 AM, Michael Niedermayer wrote:
>>> https://trac.ffmpeg.org/wiki/SponsoringPrograms/STF/2024
>> 
>> Not to derail this fine thread, but what forks does the Merge Forks
>> project refer to?
>
>I do not believe this has been answered.
>
>- Derek
>
>___
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 0/2] Remove SDL2 output devices

2024-02-04 Thread Rémi Denis-Courmont



Le 4 février 2024 11:11:12 GMT+01:00, Marton Balint  a écrit :
>Actually they work here on a linux box with OpenSuse 15.5. So even if they
>are broken on some setups, they are not broken everywhere, or not more broken 
>than they used to be.

No. They were always broken in terms of the design, and they are more 
technically broken than before because the threading rework exposed the design 
bugs from within fftools.

No sane application would use this. If it doesn't even work in fftools, it 
should be removed.

>Also, poper deprecation is needed here, since not only the CLI tools might use 
>these. Especially since there is no drop-in replacement.

First it's not what would be considered an API. The removal shouldn't break 
source compatibility, so deprecation won't get us anything here. Where would 
you even put the deprecation guards?

And then deprecation only makes sense if it can be fixed. Nobody has come 
forward with a practical solution to make it work, probably because there is 
not one, at least on MacOS.

>> The 'pipe:' output can be used with a real video player such as mpv, vlc, or
>> even ffplay. For cases where the user was an application using the API they
>> should supply their own renderer.
>
>Yeah, but I never liked when people piped uncompressed data... Not everything 
>that the devices support can be serialized, it is extra CPU, latency of the 
>receiving app reading from pipe is a question...

Those sound like pretty minor problems for something that's purely meant for 
testing, and well, at least piping works.

>I'd be a lot more happy with this if we'd offer some replacement which has no 
>issues. Maybe a libplacebo based outdev.

That's orthogonal, and you're welcome to provide patches. But AFAICT, any video 
output device would suffer the same problems on the same platforms. You simply 
can't treat video output as a generic pipeline component, at least on Windows 
and especially MacOS.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 0/2] Remove SDL2 output devices

2024-02-04 Thread Rémi Denis-Courmont



Le 4 février 2024 10:02:31 GMT+01:00, "J. Dekker"  a écrit :
>With the addition of threading in ffmpeg.c, the SDL2 devices no longer have the
>'main' thread. This means that both the SDL2 and OpenGL output device are 
>broken
>in master. Rather than attempting to fix it, they should be removed instead as
>there are better alternatives for debugging or viewing streams.

This is as agreed after discussion in yesterday's technical meeting. So 
obviously I support this patchset.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] RISC-V vector DSP functions: Motivation for commit 446b009

2024-01-19 Thread Rémi Denis-Courmont

Hi,

Le perjantaina 19. tammikuuta 2024, 17.30.00 EET Michael Platzer via ffmpeg-
devel a écrit :
> Commit 446b0090cbb66ee614dcf6ca79c78dc8eb7f0e37 by Remi Denis-Courmont has
> replaced RISC-V vector loads and stores with negative stride with vrgather
> (generalized permutation within vector registers) instructions in order to
> reverse the elements in a vector register. The commit message explains that
> this change was done, but it does not explain why.

It was faster on what was the best approximation of real hardware available at the 
time, i.e. a Sipeed Lichee Pi4A board. There are no benchmarks in the commit 
because I don't like to publish benchmarks collected from prototypes. 
Nevertheless I think the commit message hints enough that anybody could easily 
guess that it was a performance optimisation, if I'm being honest.

This is not exactly surprising: typical hardware can only access so many 
memory addresses simultaneously (i.e. one or maybe two), so indexed loads and 
strided loads are bound to be much slower than unit-strided loads.

Maybe you have access to special hardware that is able to optimise the special 
case of strides equal to minus one to reduce the number of memory accesses. 
But I didn't back then, and as a matter of fact, I still don't. Hardware 
donations are welcome.

> I fail to see what could possibly have motivated this change.

> The RISC-V vector loads and stores support negative stride values for use
> cases such as this one.

[Citation required]

> Using vrgather instead replaces the more specific operation with a more
> generic one,

That is a very subjective and unsubstantiated assertion. This feels a bit 
hypocritical while you are attacking me for not providing justification.

As far as I can tell, neither instruction is specific to reversing vector 
element order. An actual real-life specific instruction exists on Arm in the 
form of vector-reverse. I don't know any ISA with load-reverse or store-
reverse.

> which is likely to be less performant on most HW architectures.

Would you care to define "most architectures"? I only know one commercially 
available hardware architecture as of today, Kendryte K230 SoC with T-Head 
C908 CPU, so I can't make much sense of your sentence here.

> In addition, it requires to setup an index vector,

That is irrelevant since in this loop, the vector bank is not a bottleneck. 
The loop can run with maximal LMUL either way. And besides, the loop turned 
out to be faster with a smaller multiplier.

> thus raising dynamic instruction count.

It adds only one instruction (reverse subtraction) in the main loop, and even 
that could be optimised away if relevant.

-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-29 Thread Rémi Denis-Courmont

Le 30 janvier 2024 00:43:39 GMT+02:00, Michael Niedermayer 
 a écrit :
>Hi
>
>On Mon, Jan 29, 2024 at 11:01:05PM +0200, Rémi Denis-Courmont wrote:
>> Le maanantaina 29. tammikuuta 2024, 20.11.19 EET Michael Niedermayer a écrit 
>> :
>[...]
>> > Its under the control of the community and its transparent
>> 
>> You always have the control of the community at the time of review and merge.
>> 
>> You can argue all you want that more open is better. What I see is that this 
>> more open is already turning into a train wreck (as predicted last year).
>
>I do have to disagree on this specific point
>The people predicting it to be a train wreck are the people who now make it
>a train wreck.

That's clearly false and defamatory against me.

And given that you were the one to ask for feedback and project ideas that also 
constitutes entrapment.

You should step down from the CC IMO because that's very unbecoming of a CC 
member (as are your attacks against Kieran)

In these conditions I maintain that this process is inane and discriminatory.

Lastly the FFmpeg community should not be taken hostage in one person's 
personal feud against FFlabs and/or other companies. (This is purely 
hypothetical and not an accusation against anyone in particular. If you feel 
targeted, that's on you.)
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-29 Thread Rémi Denis-Courmont



Le 29 janvier 2024 22:15:39 GMT+02:00, Derek Buitenhuis 
 a écrit :
>Between this, the unanswered NAB questions, the second vote ridiculousness, the
>accidental email to the ML from Thilo where he admits he has purposely not 
>replied,
>etc.,

Also
- Reject FFmpeg project's free invitation to SCaLE because he wouldn't 
participate, rather than pass it on.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] Sovereign Tech Fund

2024-01-31 Thread Rémi Denis-Courmont

Hi,

Le keskiviikkona 31. tammikuuta 2024, 16.10.02 EET Jonatas L. Nogueira via 
ffmpeg-devel a écrit :
> > IMO hasty actions and avoidable drama may cause damage to the project
> 
> What would be a hasty action? I've seen far too much people calling action
> over stuff discussed for weeks/months as "hasty" in attempt to stall into
> endless discussions, so you might want to clarify.

Would you care to clarify which astronomical body you count weeks and 
months in? I believe that it is customary to use Earth units when you do not 
specify. And in this case, the topic was brought to the community just about 
0.5 week, or 0.11 month ago.

Sarcasm aside, I take that to mean that SPI has been involved with those 
discussions for months in a private and closed process. Michael asserted that 
an open inclusive process is better than the usual closed approach whence the 
funding goes through a company.

It looks to me that those SPI discussions were just as opaque and closed, and 
all the talk of openness is just pretense. It does not help that Michael, and 
now you too, misrepresent any challenge to SPI proposed *process* as an 
attempt to reject the idea of STF sponsorship, under the convenient pretext 
that there is not enough time.


This is further aggravated by the context that Michael brought forward the 
idea of funding developers through SPI 3 months ago (in actual Earth units). 
From your statement, I have to infer that Thilo, Michael and SPI already knew 
of the STF plan and concealed that key piece of contextual information back 
then.

In hindsight, it feels hypocritical to me that they were arguing for the SPI 
path, and against the corporate path, on the basis of openness already then, to 
be honest.


I can only agree with Anton that this looks like an attempt to strongarm the 
community. This is ostensibly being done to ignore all the objections that were 
already brought in October and are being brought again now, with the 
complicity of SPI. I can't say that this looks well on SPI, but that's just my 
personal opinion.


With all that said, I don't think anybody will attempt to prevent this from 
happening (if they even can?). But that will take place without the consent of 
the GA, without any legitimacy on the claims of openness and inclusiveness, and 
obviously without any form of preclearance from the technical appropriateness 
of the resulting code contributions.



-- 
レミ・デニ-クールモン
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] RISC-V vector DSP functions: Motivation for commit 446b009

2024-01-23 Thread Rémi Denis-Courmont

Le tiistaina 23. tammikuuta 2024, 19.34.46 EET Michael Platzer via ffmpeg-devel 
a écrit :
> I agree that the indexed and strided loads and stores are certainly slower
> than unit-strided loads and stores. However, the vrgather instruction is
> unlikely to be very performant either, unless the vector length is
> relatively short.

> Particularly, if vector register groups are used via a
> length multiplier LMUL of, e.g., 8, then any element in the destination
> vector register could be sourced from any element in the 8 source vector
> registers (i.e., 1/4 of the vector register file).

Gather instructions seem to scale quadratically on existing hardware, which is 
bad. That's why the FFmpeg code was later modified to use LMUL=1 in that 
particular case.

Now if you want to argue that VLSE is better, then please provide a patch 
exhibiting better performance on FFmpeg's checkasm on real hardware.
Otherwise, this discussion is not much more than he-said-she-said.

> By contrast, the performance of strided loads and stores, while certainly
> slower than unit-strided loads and stores, likely scales linearly with the
> vector length, so on CPUs with large VLEN the original code could very well
> run faster than the variant with vrgather, despite the slower strided loads
> and stores.

Yes, but it's a stretch to expect that accessing memory will be faster than 
accessing registers, especially when the dataset is typically too large to fit 
in L1. Furthermore strided loads require adders to compute the accessed 
address - something VRGATHER (or even VLUXEI) does not need.

Some people wish that processor cores would make a special optimised case of 
minus EEW/8 strides. And sure, that would be nice. But so far that's just 
wishful thinking.

> > > The RISC-V vector loads and stores support negative stride values for 
> > > use cases such as this one.
> > 
> > [Citation required]
>
> The purpose of strided loads and stores is to load/store elements that are
> not consecutive in memory, but instead separated by a constant offset.
> Additionally, the authors of the specification decided to allow negative
> stride values, since they apparently deemed it useful to be able to reverse
> the order of those elements.

FFmpeg *still* uses strided loads and stores where applicable, typically where 
the stride is legitimately variable. I cannot find a justification that small 
constant non-unit strides would be a good idea anywhere though.
 
Just because you can use negative offsets does not mean that this will be 
optimised for negative-unit offsets. Again, I have only seen some wishful 
thinking from some developers here and there. I have yet to see a serious 
quote from an IP vendor or a benchmark that would support this.

> > > Using vrgather instead replaces the more specific operation with a 
> > > more generic one,
> > 
> > 
> > That is a very subjective and unsubstantiated assertion. This feels a bit
> > hypocritical while you are attacking me for not providing justification.
> 
> vrgather is more generic because it can be used for any kind of permutation,
> which strided loads and stores cannot. This is not subjective.

That would be a fair comparison of vrgather with hypothetical vreverse or 
vtranspose instructions. But you're comparing apples and oranges here.
 
> > As far as I can tell, neither instruction are specific to reversing vector
> > element order. An actual real-life specific instruction exists on Arm in
> > the form of vector-reverse. I don't know any ISA with load-reverse or
> > store- reverse.
> 
> A load-reverse or store-reverse would just be a special case of strided
> load/store.

By that logic, a unit-stride load is just a special case of a strided load, 
and a strided load is just a special case of an indexed load. From an 
architectural functional standpoint, that is indeed definitely true. From a 
hardware silicon design and microbenchmark standpoint, that is however 
certainly false.
 
> When writing about the performance of vrgather I primarily had the
> scalability issues explained above in mind. It seems that you have already
> experienced these, since you found that a larger LMUL reduces the
> performance of vrgather.

> How would the reverse subtraction be optimized away? I assume that it needs
> to be part of the loop since it depends on the VL of the current iteration.

VRSUB computes the same vector at all but the last two iterations. All you 
need to do is make a special case for the tail iterations. Then VRSUB can be 
run just twice for the whole function, zero times per loop iteration.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/llviddsp: R-V V add_bytes

2023-11-15 Thread Rémi Denis-Courmont

add_bytes_c:  2077.2
add_bytes_rvv_i32: 105.0
---
 libavcodec/lossless_videodsp.c   |  2 ++
 libavcodec/lossless_videodsp.h   |  1 +
 libavcodec/riscv/Makefile|  2 ++
 libavcodec/riscv/llviddsp_init.c | 38 
 libavcodec/riscv/llviddsp_rvv.S  | 36 ++
 5 files changed, 79 insertions(+)
 create mode 100644 libavcodec/riscv/llviddsp_init.c
 create mode 100644 libavcodec/riscv/llviddsp_rvv.S

diff --git a/libavcodec/lossless_videodsp.c b/libavcodec/lossless_videodsp.c
index 359606981c..876decb1e6 100644
--- a/libavcodec/lossless_videodsp.c
+++ b/libavcodec/lossless_videodsp.c
@@ -121,6 +121,8 @@ void ff_llviddsp_init(LLVidDSPContext *c)
 
 #if ARCH_PPC
 ff_llviddsp_init_ppc(c);
+#elif ARCH_RISCV
+ff_llviddsp_init_riscv(c);
 #elif ARCH_X86
 ff_llviddsp_init_x86(c);
 #endif
diff --git a/libavcodec/lossless_videodsp.h b/libavcodec/lossless_videodsp.h
index da4baa1414..5309ce4be7 100644
--- a/libavcodec/lossless_videodsp.h
+++ b/libavcodec/lossless_videodsp.h
@@ -40,6 +40,7 @@ typedef struct LLVidDSPContext {
 } LLVidDSPContext;
 
 void ff_llviddsp_init(LLVidDSPContext *llviddsp);
+void ff_llviddsp_init_riscv(LLVidDSPContext *llviddsp);
 void ff_llviddsp_init_x86(LLVidDSPContext *llviddsp);
 void ff_llviddsp_init_ppc(LLVidDSPContext *llviddsp);
 
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index d34dc77458..8f2a519827 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -28,6 +28,8 @@ OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
 RVV-OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_rvv.o
 OBJS-$(CONFIG_LLAUDDSP) += riscv/llauddsp_init.o
 RVV-OBJS-$(CONFIG_LLAUDDSP) += riscv/llauddsp_rvv.o
+OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_init.o
+RVV-OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
 RVV-OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_rvv.o
 OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
diff --git a/libavcodec/riscv/llviddsp_init.c b/libavcodec/riscv/llviddsp_init.c
new file mode 100644
index 00..f042eeab32
--- /dev/null
+++ b/libavcodec/riscv/llviddsp_init.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/lossless_videodsp.h"
+
+void ff_llvid_add_bytes_rvv(uint8_t *, uint8_t *src, ptrdiff_t w);
+
+av_cold void ff_llviddsp_init_riscv(LLVidDSPContext *c)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_I32) {
+c->add_bytes = ff_llvid_add_bytes_rvv;
+}
+#endif
+}
diff --git a/libavcodec/riscv/llviddsp_rvv.S b/libavcodec/riscv/llviddsp_rvv.S
new file mode 100644
index 00..a4814837b9
--- /dev/null
+++ b/libavcodec/riscv/llviddsp_rvv.S
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_llvid_add_bytes_rvv, zve32x
+1:
+vsetvli t0, a2, e8, m8, ta, ma
+vle8.v  v0, (a1)
+sub a2, a2, t0
+vle8.v  v8, (a0)
+add a1, t0, a1
+vadd.vv v8, v0, v8
+vse8.v  v8, (a0)
+add a0, t0, a0
+bneza2, 1b
+
+ret
+endfunc
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ff

Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add

2023-11-15 Thread Rémi Denis-Courmont

Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> Okay, I have updated these issues in the patch.

It does not assemble but I can fix it locally. The narrowing shift trickery 
requires Zve64x, or rather Zve64f in this case.

The performance improvement is much better on newer hardware:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5

FWIW, VLSEG2E32.V remains slightly worse than with shifting:
fcmul_add_c: 4891.2
fcmul_add_rvv_f32: 2877.5

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] checkasm/flacdsp: add LPC test

2023-11-15 Thread Rémi Denis-Courmont

---
 tests/checkasm/flacdsp.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 51a0e0060b..589a3fe834 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -54,6 +54,27 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t 
**ref_src, uint8_t **ne
 bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / 
sizeof(int32_t), 8);
 }
 
+static void check_lpc(FLACDSPContext *c)
+{
+int pred_order = (rnd() % 32) + 1;
+int qlevel = rnd() % 16;
+LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
+LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
+
+declare_func(void, int32_t *, const int[32], int, int, int);
+
+for (int i = 0; i < BUF_SIZE; i++)
+dst0[i] = rnd();
+
+memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
+call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
+call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
+   fail();
+bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
 LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -88,4 +109,11 @@ void checkasm_check_flacdsp(void)
 }
 
 report("decorrelate");
+
+if (check_func(h.lpc16, "flac_lpc_16"))
+check_lpc();
+if (check_func(h.lpc32, "flac_lpc_32"))
+check_lpc();
+
+report("lpc");
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm/flacdsp: add LPC test

2023-11-15 Thread Rémi Denis-Courmont

Le keskiviikkona 15. marraskuuta 2023, 18.21.34 EET Rémi Denis-Courmont a 
écrit :
> ---
>  tests/checkasm/flacdsp.c | 28 
>  1 file changed, 28 insertions(+)
> 
> diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
> index 51a0e0060b..589a3fe834 100644
> --- a/tests/checkasm/flacdsp.c
> +++ b/tests/checkasm/flacdsp.c
> @@ -54,6 +54,27 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t
> **ref_src, uint8_t **ne bench_new(new_dst, (int32_t **)new_src, channels,
> BUF_SIZE / sizeof(int32_t), 8); }
> 
> +static void check_lpc(FLACDSPContext *c)
> +{
> +int pred_order = (rnd() % 32) + 1;
> +int qlevel = rnd() % 16;
> +LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
> +LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
> +LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
> +
> +declare_func(void, int32_t *, const int[32], int, int, int);

Hmmph, nevermind, forgot to initialise the coefficients.

> +
> +for (int i = 0; i < BUF_SIZE; i++)
> +dst0[i] = rnd();
> +
> +memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
> +call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
> +call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
> +if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
> +   fail();
> +bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
> +}
> +
>  void checkasm_check_flacdsp(void)
>  {
>  LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
> @@ -88,4 +109,11 @@ void checkasm_check_flacdsp(void)
>  }
> 
>  report("decorrelate");
> +
> +if (check_func(h.lpc16, "flac_lpc_16"))
> +check_lpc();
> +if (check_func(h.lpc32, "flac_lpc_32"))
> +check_lpc();
> +
> +report("lpc");
>  }


-- 
Rémi Denis-Courmont
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test

2023-11-15 Thread Rémi Denis-Courmont

Le keskiviikkona 15. marraskuuta 2023, 21.14.26 EET James Almer a écrit :
> On 11/15/2023 3:02 PM, Rémi Denis-Courmont wrote:
> > ---
> > 
> >   tests/checkasm/flacdsp.c | 32 
> >   1 file changed, 32 insertions(+)
> > 
> > diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
> > index 51a0e0060b..4d69cbe507 100644
> > --- a/tests/checkasm/flacdsp.c
> > +++ b/tests/checkasm/flacdsp.c
> > @@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst,
> > uint8_t **ref_src, uint8_t **ne> 
> >   bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE /
> >   sizeof(int32_t), 8);>   
> >   }
> > 
> > +static void check_lpc(FLACDSPContext *c, int pred_order)
> 
> c is unused.
> 
> > +{
> > +int qlevel = rnd() % 16;
> > +LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
> > +LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
> > +LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
> > +
> > +declare_func(void, int32_t *, const int[32], int, int, int);
> > +
> > +for (int i = 0; i < 32; i++)
> > +coeffs[i] = rnd();
> > +for (int i = 0; i < BUF_SIZE; i++)
> > +dst0[i] = rnd();
> > +
> > +memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
> > +call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
> > +call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
> > +if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
> > +   fail();
> > +bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
> 
> Not sure if it matters, but dst1 is already trashed by call_new().

Yeah I know. I could allocate a third buffer. AFAICT, the only parameter that 
should affect the benchmarks is pred-order (which indeed affects the result on 
both x86 and RVV). So that the extra code to preserve dst seemed pointless?

> 
> > +}
> > +
> > 
> >   void checkasm_check_flacdsp(void)
> >   {
> >   
> >   LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
> > 
> > @@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void)
> > 
> >   { AV_SAMPLE_FMT_S16, 16 },
> >   { AV_SAMPLE_FMT_S32, 32 },
> >   
> >   };
> > 
> > +static const signed char pred_orders[] = { 13, 16, 29, 32 };
> > 
> >   FLACDSPContext h;
> >   int i, j;
> > 
> > @@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void)
> > 
> >   }
> >   
> >   report("decorrelate");
> > 
> > +
> > +for (int i = 0; i < sizeof (pred_orders); i++) {
> 
> i is already defined. Also, use FF_ARRAY_ELEMS(pred_orders), so it
> doesn't depend on char being 1 byte.
> 
> > +if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i]))
> > +check_lpc(, pred_orders[i]);
> > +if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
> > +check_lpc(, pred_orders[i]);
> > +}
> > +
> > +report("lpc");
> > 
> >   }
> 
> LGTM otherwise.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCHv3] checkasm/flacdsp: add LPC test

2023-11-15 Thread Rémi Denis-Courmont

---
 tests/checkasm/flacdsp.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 51a0e0060b..b308237db1 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t 
**ref_src, uint8_t **ne
 bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / 
sizeof(int32_t), 8);
 }
 
+static void check_lpc(int pred_order)
+{
+int qlevel = rnd() % 16;
+LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
+LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
+
+declare_func(void, int32_t *, const int[32], int, int, int);
+
+for (int i = 0; i < 32; i++)
+coeffs[i] = rnd();
+for (int i = 0; i < BUF_SIZE; i++)
+dst0[i] = rnd();
+
+memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
+call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
+call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
+   fail();
+bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
 LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void)
 { AV_SAMPLE_FMT_S16, 16 },
 { AV_SAMPLE_FMT_S32, 32 },
 };
+static const signed char pred_orders[] = { 13, 16, 29, 32 };
 FLACDSPContext h;
 int i, j;
 
@@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void)
 }
 
 report("decorrelate");
+
+for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
+if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i]))
+check_lpc(pred_orders[i]);
+for (i = 0; i < FF_ARRAY_ELEMS(pred_orders); i++)
+if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
+check_lpc(pred_orders[i]);
+
+report("lpc");
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/flacdsp: R-V V LPC16 function

2023-11-15 Thread Rémi Denis-Courmont

In this case, the inner loop computing the scalar product can be reduced
to just one multiplication and one sum even with 128-bit vectors. The
result is a lot simpler, but also brings more modest performance gains:

flac_lpc_16_13_c:   15241.0
flac_lpc_16_13_rvv_i32: 11230.0
flac_lpc_16_16_c:   17884.0
flac_lpc_16_16_rvv_i32: 12125.7
flac_lpc_16_29_c:   27847.7
flac_lpc_16_29_rvv_i32: 10494.0
flac_lpc_16_32_c:   30051.5
flac_lpc_16_32_rvv_i32: 10355.0
---
 libavcodec/riscv/flacdsp_init.c | 17 -
 libavcodec/riscv/flacdsp_rvv.S  | 23 +++
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index f60f98ea31..6cfb50ead8 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -25,6 +25,8 @@
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/flacdsp.h"
 
+void ff_flac_lpc16_rvv(int32_t *decoded, const int coeffs[32],
+   int pred_order, int qlevel, int len);
 void ff_flac_lpc32_rvv(int32_t *decoded, const int coeffs[32],
int pred_order, int qlevel, int len);
 void ff_flac_lpc32_rvv_simple(int32_t *decoded, const int coeffs[32],
@@ -61,16 +63,20 @@ void ff_flac_decorrelate_ms_32_rvv(uint8_t **out, int32_t 
**in,
 av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
int channels)
 {
-#if HAVE_RVV && (__riscv_xlen >= 64)
+#if HAVE_RVV
 int flags = av_get_cpu_flags();
 
 if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
 int vlenb = ff_get_rv_vlenb();
 
-if (vlenb == 16)
-c->lpc32 = ff_flac_lpc32_rvv;
-else if (vlenb > 16) 
-c->lpc32 = ff_flac_lpc32_rvv_simple;
+if (vlenb >= 16) {
+c->lpc16 = ff_flac_lpc16_rvv;
+# if (__riscv_xlen >= 64)
+if (vlenb > 16)
+c->lpc32 = ff_flac_lpc32_rvv_simple;
+else
+c->lpc32 = ff_flac_lpc32_rvv;
+}
 
 switch (fmt) {
 case AV_SAMPLE_FMT_S16:
@@ -111,6 +117,7 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum 
AVSampleFormat fmt,
 c->decorrelate[2] = ff_flac_decorrelate_rs_32_rvv;
 c->decorrelate[3] = ff_flac_decorrelate_ms_32_rvv;
 break;
+# endif
 }
 }
 #endif
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index b1724f5500..2a0b50f7a9 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -20,6 +20,29 @@
 
 #include "libavutil/riscv/asm.S"
 
+func ff_flac_lpc16_rvv, zve32x
+vsetvli zero, a2, e32, m8, ta, ma
+vle32.v v8, (a1)
+sub a4, a4, a2
+vle32.v v16, (a0)
+sh2add  a0, a2, a0
+vmv.s.x v0, zero
+1:
+vmul.vv v24, v8, v16
+lw  t0, (a0)
+vredsum.vs v24, v24, v0
+addia4, a4, -1
+vmv.x.s t1, v24
+sra t1, t1, a3
+add t0, t0, t1
+vslide1down.vx v16, v16, t0
+sw  t0, (a0)
+addia0, a0, 4
+bneza4, 1b
+
+ret
+endfunc
+
 #if (__riscv_xlen == 64)
 func ff_flac_lpc32_rvv, zve32x
 addit2, a2, -16
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] riscv: fix builds without Zbb support

2023-11-15 Thread Rémi Denis-Courmont

---
 libavutil/riscv/asm.S | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libavutil/riscv/asm.S b/libavutil/riscv/asm.S
index 6ca74f263a..0a9e2e0d3f 100644
--- a/libavutil/riscv/asm.S
+++ b/libavutil/riscv/asm.S
@@ -92,6 +92,11 @@
 shnadd  3, \rd, \rs1, \rs2
 .endm
 #endif
+#if !defined (__riscv_zbb)
+.macro  min rd, rs1, rs2
+.insn r OP, 4, 5, \rd, \rs1, \rs2
+.endm
+#endif
 
 /* Convenience macro to load a Vector type (vtype) as immediate */
 .macro  lvtypei rd, e, m=m1, tp=tu, mp=mu
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] checkasm/flacdsp: add LPC test

2023-11-15 Thread Rémi Denis-Courmont

---
 tests/checkasm/flacdsp.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index 51a0e0060b..4d69cbe507 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -54,6 +54,28 @@ static void check_decorrelate(uint8_t **ref_dst, uint8_t 
**ref_src, uint8_t **ne
 bench_new(new_dst, (int32_t **)new_src, channels, BUF_SIZE / 
sizeof(int32_t), 8);
 }
 
+static void check_lpc(FLACDSPContext *c, int pred_order)
+{
+int qlevel = rnd() % 16;
+LOCAL_ALIGNED_16(int32_t, coeffs, [32]);
+LOCAL_ALIGNED_16(int32_t, dst0, [BUF_SIZE]);
+LOCAL_ALIGNED_16(int32_t, dst1, [BUF_SIZE]);
+
+declare_func(void, int32_t *, const int[32], int, int, int);
+
+for (int i = 0; i < 32; i++)
+coeffs[i] = rnd();
+for (int i = 0; i < BUF_SIZE; i++)
+dst0[i] = rnd();
+
+memcpy(dst1, dst0, BUF_SIZE * sizeof (int32_t));
+call_ref(dst0, coeffs, pred_order, qlevel, BUF_SIZE);
+call_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+if (memcmp(dst0, dst1, BUF_SIZE * sizeof (int32_t)) != 0)
+   fail();
+bench_new(dst1, coeffs, pred_order, qlevel, BUF_SIZE);
+}
+
 void checkasm_check_flacdsp(void)
 {
 LOCAL_ALIGNED_16(uint8_t, ref_dst, [BUF_SIZE*MAX_CHANNELS]);
@@ -72,6 +94,7 @@ void checkasm_check_flacdsp(void)
 { AV_SAMPLE_FMT_S16, 16 },
 { AV_SAMPLE_FMT_S32, 32 },
 };
+static const signed char pred_orders[] = { 13, 16, 29, 32 };
 FLACDSPContext h;
 int i, j;
 
@@ -88,4 +111,13 @@ void checkasm_check_flacdsp(void)
 }
 
 report("decorrelate");
+
+for (int i = 0; i < sizeof (pred_orders); i++) {
+if (check_func(h.lpc16, "flac_lpc_16_%d", pred_orders[i]))
+check_lpc(, pred_orders[i]);
+if (check_func(h.lpc32, "flac_lpc_32_%d", pred_orders[i]))
+check_lpc(, pred_orders[i]);
+}
+
+report("lpc");
 }
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V LPC32

2023-11-15 Thread Rémi Denis-Courmont

The entire set of 32 coefficients and corresponding past 32 samples can
fit in a single vector (with LMUL=8) exactly, but... since widening
doubles the needed vector sizes, we still end up too short with 128-bit
vectors. This adds a very simple version for future 256+-bit hardware,
and for pred_orders values up to 16, and a slightly more involved loop
for 128-bit hardware with pred_orders between 17 and 32.

With 128-bit hardware, the benchmarks look like this:
flac_lpc_32_13_c:   30152.0
flac_lpc_32_13_rvv_i32: 10244.7
flac_lpc_32_16_c:   37314.2
flac_lpc_32_16_rvv_i32: 10126.2
flac_lpc_32_29_c:   61910.0
flac_lpc_32_29_rvv_i32: 14495.2
flac_lpc_32_32_c:   68204.0
flac_lpc_32_32_rvv_i32: 13273.7
---
 libavcodec/riscv/flacdsp_init.c | 12 +++
 libavcodec/riscv/flacdsp_rvv.S  | 57 +
 2 files changed, 69 insertions(+)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index 73d431cb77..f60f98ea31 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -22,8 +22,13 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/flacdsp.h"
 
+void ff_flac_lpc32_rvv(int32_t *decoded, const int coeffs[32],
+   int pred_order, int qlevel, int len);
+void ff_flac_lpc32_rvv_simple(int32_t *decoded, const int coeffs[32],
+  int pred_order, int qlevel, int len);
 void ff_flac_decorrelate_indep2_16_rvv(uint8_t **out, int32_t **in,
int channels, int len, int shift);
 void ff_flac_decorrelate_indep4_16_rvv(uint8_t **out, int32_t **in,
@@ -60,6 +65,13 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum 
AVSampleFormat fmt,
 int flags = av_get_cpu_flags();
 
 if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
+int vlenb = ff_get_rv_vlenb();
+
+if (vlenb == 16)
+c->lpc32 = ff_flac_lpc32_rvv;
+else if (vlenb > 16) 
+c->lpc32 = ff_flac_lpc32_rvv_simple;
+
 switch (fmt) {
 case AV_SAMPLE_FMT_S16:
 switch (channels) {
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index 12b456f7da..b1724f5500 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -21,6 +21,63 @@
 #include "libavutil/riscv/asm.S"
 
 #if (__riscv_xlen == 64)
+func ff_flac_lpc32_rvv, zve32x
+addit2, a2, -16
+ble t2, zero, ff_flac_lpc32_rvv_simple
+vsetivli zero, 1, e64, m1, ta, ma
+vmv.s.x v0, zero
+vsetvli zero, a2, e32, m8, ta, ma
+vle32.v v8, (a1)
+sub a4, a4, a2
+vle32.v v16, (a0)
+sh2add  a0, a2, a0
+1:
+vsetvli zero, a2, e32, m4, ta, ma
+vwmul.vv v24, v8, v16
+vsetvli zero, t2, e32, m4, tu, ma
+vwmacc.vv v24, v12, v20
+vsetvli zero, a2, e64, m8, ta, ma
+vredsum.vs v24, v24, v0
+lw  t0, (a0)
+addia4, a4, -1
+vmv.x.s t1, v24
+vsetvli zero, a2, e32, m8, ta, ma
+sra t1, t1, a3
+add t0, t0, t1
+vslide1down.vx v16, v16, t0
+sw  t0, (a0)
+addia0, a0, 4
+bneza4, 1b
+
+ret
+endfunc
+
+func ff_flac_lpc32_rvv_simple, zve32x
+vsetivli zero, 1, e64, m1, ta, ma
+vmv.s.x v0, zero
+vsetvli zero, a2, e32, m4, ta, ma
+vle32.v v8, (a1)
+sub a4, a4, a2
+vle32.v v16, (a0)
+sh2add  a0, a2, a0
+1:
+vwmul.vv v24, v8, v16
+vsetvli zero, zero, e64, m8, ta, ma
+vredsum.vs v24, v24, v0
+lw  t0, (a0)
+addia4, a4, -1
+vmv.x.s t1, v24
+vsetvli zero, zero, e32, m4, ta, ma
+sra t1, t1, a3
+add t0, t0, t1
+vslide1down.vx v16, v16, t0
+sw  t0, (a0)
+addia0, a0, 4
+bneza4, 1b
+
+ret
+endfunc
+
 func ff_flac_decorrelate_indep2_16_rvv, zve32x
 ld  a0,  (a0)
 ld  a2, 8(a1)
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] riscv: indent code

2023-11-18 Thread Rémi Denis-Courmont

This reindents code to prepare for the next changeset.
No functional changes.
---
 libavutil/riscv/cpu.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 460d3e9f91..984293aef0 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -32,21 +32,23 @@ int ff_get_cpu_flags_riscv(void)
 {
 int ret = 0;
 #if HAVE_GETAUXVAL
-const unsigned long hwcap = getauxval(AT_HWCAP);
+{
+const unsigned long hwcap = getauxval(AT_HWCAP);
 
-if (hwcap & HWCAP_RV('I'))
-ret |= AV_CPU_FLAG_RVI;
-if (hwcap & HWCAP_RV('F'))
-ret |= AV_CPU_FLAG_RVF;
-if (hwcap & HWCAP_RV('D'))
-ret |= AV_CPU_FLAG_RVD;
-if (hwcap & HWCAP_RV('B'))
-ret |= AV_CPU_FLAG_RVB_ADDR | AV_CPU_FLAG_RVB_BASIC;
+if (hwcap & HWCAP_RV('I'))
+ret |= AV_CPU_FLAG_RVI;
+if (hwcap & HWCAP_RV('F'))
+ret |= AV_CPU_FLAG_RVF;
+if (hwcap & HWCAP_RV('D'))
+ret |= AV_CPU_FLAG_RVD;
+if (hwcap & HWCAP_RV('B'))
+ret |= AV_CPU_FLAG_RVB_ADDR | AV_CPU_FLAG_RVB_BASIC;
 
-/* The V extension implies all Zve* functional subsets */
-if (hwcap & HWCAP_RV('V'))
-ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
- | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+/* The V extension implies all Zve* functional subsets */
+if (hwcap & HWCAP_RV('V'))
+ ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+  | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+}
 #endif
 
 #ifdef __riscv_i
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] riscv: add hwprobe() for CPU detection

2023-11-18 Thread Rémi Denis-Courmont

This adds the Linux-specific system call to detect CPU features. Unlike
the auxiliary vector, this supports extensions other than single-letter
ones. (The API is kind of a mess though.)

At the moment, we need this to detect Zba and Zbb at run-time.
---
 configure |  5 +
 libavutil/riscv/cpu.c | 43 +++
 2 files changed, 48 insertions(+)

diff --git a/configure b/configure
index 6be849fc08..a6039c1476 100755
--- a/configure
+++ b/configure
@@ -2202,6 +2202,7 @@ HAVE_LIST_PUB="
 
 HEADERS_LIST="
 arpa_inet_h
+asm_hwprobe_h
 asm_types_h
 cdio_paranoia_h
 cdio_paranoia_paranoia_h
@@ -2227,6 +2228,7 @@ HEADERS_LIST="
 opencv2_core_core_c_h
 OpenGL_gl3_h
 poll_h
+sys_hwprobe_h
 sys_param_h
 sys_resource_h
 sys_select_h
@@ -5410,6 +5412,9 @@ elif enabled ppc; then
 
 elif enabled riscv; then
 
+check_headers asm/hwprobe.h
+check_headers sys/hwprobe.h
+
 if test_cpp_condition stddef.h "__riscv_zbb"; then
 enable fast_clz
 fi
diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 984293aef0..23e49767c2 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -18,8 +18,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define _GNU_SOURCE
 #include "libavutil/cpu.h"
 #include "libavutil/cpu_internal.h"
+#include "libavutil/macros.h"
 #include "libavutil/log.h"
 #include "config.h"
 
@@ -27,10 +29,51 @@
 #include <sys/auxv.h>
 #define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
 #endif
+#if defined (HAVE_SYS_HWPROBE_H)
+#include <sys/hwprobe.h>
+#elif defined (HAVE_ASM_HWPROBE_H)
+#include <asm/hwprobe.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+static int __riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count,
+   size_t cpu_count, unsigned long *cpus,
+   unsigned int flags)
+{
+return syscall(__NR_riscv_hwprobe, pairs, pair_count, cpu_count, cpus,
+   flags);
+}
+#endif
 
 int ff_get_cpu_flags_riscv(void)
 {
 int ret = 0;
+#if defined (HAVE_SYS_HWPROBE_H) || defined (HAVE_ASM_HWPROBE_H)
+struct riscv_hwprobe pairs[] = {
+{ RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0 },
+{ RISCV_HWPROBE_KEY_IMA_EXT_0, 0 },
+};
+
+if (__riscv_hwprobe(pairs, FF_ARRAY_ELEMS(pairs), 0, NULL, 0) == 0) {
+if (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA)
+ret |= AV_CPU_FLAG_RVI;
+if (pairs[1].value & RISCV_HWPROBE_IMA_FD)
+ret |= AV_CPU_FLAG_RVF | AV_CPU_FLAG_RVD;
+# ifdef RISCV_HWPROBE_IMA_V
+if (pairs[1].value & RISCV_HWPROBE_IMA_V)
+ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+# endif
+# ifdef RISCV_HWPROBE_EXT_ZBA
+if (pairs[1].value & RISCV_HWPROBE_EXT_ZBA)
+ret |= AV_CPU_FLAG_RVB_ADDR;
+# endif
+# ifdef RISCV_HWPROBE_EXT_ZBB
+if (pairs[1].value & RISCV_HWPROBE_EXT_ZBB)
+ret |= AV_CPU_FLAG_RVB_BASIC;
+# endif
+} else
+#endif
 #if HAVE_GETAUXVAL
 {
 const unsigned long hwcap = getauxval(AT_HWCAP);
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavu/fixed_dsp: R-V V fmul_window_scaled

2023-11-19 Thread Rémi Denis-Courmont

vector_fmul_window_scaled_fixed_c:   4393.7
vector_fmul_window_scaled_fixed_rvv_i64: 1642.7
---
 libavutil/riscv/fixed_dsp_init.c |  7 -
 libavutil/riscv/fixed_dsp_rvv.S  | 48 
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/libavutil/riscv/fixed_dsp_init.c b/libavutil/riscv/fixed_dsp_init.c
index 6469b45374..cd318af486 100644
--- a/libavutil/riscv/fixed_dsp_init.c
+++ b/libavutil/riscv/fixed_dsp_init.c
@@ -25,6 +25,9 @@
 #include "libavutil/cpu.h"
 #include "libavutil/fixed_dsp.h"
 
+void ff_vector_fmul_window_scaled_rvv(int16_t *dst, const int32_t *src0,
+  const int32_t *src1, const int32_t *win,
+  int len, uint8_t bits);
 void ff_vector_fmul_window_fixed_rvv(int32_t *dst, const int32_t *src0,
  const int32_t *src1, const int32_t *win,
  int len);
@@ -43,8 +46,10 @@ av_cold void ff_fixed_dsp_init_riscv(AVFixedDSPContext *fdsp)
 int flags = av_get_cpu_flags();
 
 if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
-if (flags & AV_CPU_FLAG_RVV_I64)
+if (flags & AV_CPU_FLAG_RVV_I64) {
+fdsp->vector_fmul_window_scaled = ff_vector_fmul_window_scaled_rvv;
 fdsp->vector_fmul_window = ff_vector_fmul_window_fixed_rvv;
+}
 
 fdsp->vector_fmul = ff_vector_fmul_fixed_rvv;
 fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_fixed_rvv;
diff --git a/libavutil/riscv/fixed_dsp_rvv.S b/libavutil/riscv/fixed_dsp_rvv.S
index 68de6d7e1b..6bac5813b8 100644
--- a/libavutil/riscv/fixed_dsp_rvv.S
+++ b/libavutil/riscv/fixed_dsp_rvv.S
@@ -20,6 +20,54 @@
 
 #include "asm.S"
 
+func ff_vector_fmul_window_scaled_rvv, zve64x
+csrwi   vxrm, 0
+vsetvli t0, zero, e16, m1, ta, ma
+sh2add  a2, a4, a2
+vid.v   v0
+sh3add  t3, a4, a3
+vadd.vi v0, v0, 1
+sh2add  t0, a4, a0
+1:
+vsetvli t2, a4, e16, m1, ta, ma
+sllit4, t2, 2
+sllit1, t2, 1
+vrsub.vx v2, v0, t2
+sub t3, t3, t4
+vsetvli zero, zero, e32, m2, ta, ma
+sub a2, a2, t4
+vle32.v v8, (t3)
+sub t0, t0, t1
+vle32.v v4, (a2)
+sub a4, a4, t2
+vrgatherei16.vv v28, v8, v2
+vle32.v v16, (a1)
+add a1, a1, t4
+vrgatherei16.vv v20, v4, v2
+vle32.v v24, (a3)
+add a3, a3, t4
+vwmul.vv v12, v16, v28
+vwmul.vv v8, v16, v24
+// vwnmsac.vv does _not_ exist so multiply & subtract separately
+vwmul.vv v4, v20, v24
+vwmacc.vv v8, v20, v28
+vsetvli zero, zero, e64, m4, ta, ma
+vsub.vv v12, v12, v4
+vsetvli zero, zero, e32, m2, ta, ma
+vnclip.wi v16, v8, 31
+vnclip.wi v20, v12, 31
+vsetvli zero, zero, e16, m1, ta, ma
+vnclip.wx v8, v16, a5
+vnclip.wx v12, v20, a5
+vrgatherei16.vv v16, v8, v2
+vse16.v v12, (a0)
+add a0, a0, t1
+vse16.v v16, (t0)
+bneza4, 1b
+
+ret
+endfunc
+
 func ff_vector_fmul_window_fixed_rvv, zve64x
 csrwi   vxrm, 0
 vsetvli t0, zero, e16, m1, ta, ma
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/g722dsp: optimise R-V V apply_qmf

2023-11-19 Thread Rémi Denis-Courmont

This stores the constant coefficients deinterleaved, so that they can be
loaded directly with NF=0. Unfortunately, we cannot optimise loading the
input, due to insufficient memory alignment (not 32-bit).

Before:
g722_apply_qmf_c:   82.5
g722_apply_qmf_rvv_i32: 78.2

After:
g722_apply_qmf_c:   82.5
g722_apply_qmf_rvv_i32: 65.2
---
 libavcodec/riscv/g722dsp_rvv.S | 24 +---
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/libavcodec/riscv/g722dsp_rvv.S b/libavcodec/riscv/g722dsp_rvv.S
index 350be8dc1f..981d5cecd8 100644
--- a/libavcodec/riscv/g722dsp_rvv.S
+++ b/libavcodec/riscv/g722dsp_rvv.S
@@ -24,7 +24,9 @@ func ff_g722_apply_qmf_rvv, zve32x
 lla t0, qmf_coeffs
 vsetivlizero, 12, e16, m2, ta, ma
 vlseg2e16.v v28, (a0)
-vlseg2e16.v v24, (t0)
+addit1, t0, 12 * 2
+vle16.v v24, (t0)
+vle16.v v26, (t1)
 vwmul.vvv16, v28, v24
 vwmul.vvv20, v30, v26
 vsetivlizero, 12, e32, m4, ta, ma
@@ -41,26 +43,26 @@ endfunc
 const qmf_coeffs, align=2
 .short 3
 .short   -11
-.short   -11
-.short53
 .short12
-.short  -156
 .short32
-.short   362
 .short  -210
-.short  -805
 .short   951
 .short  3876
-.short  3876
-.short   951
 .short  -805
-.short  -210
 .short   362
-.short32
 .short  -156
-.short12
 .short53
 .short   -11
 .short   -11
+.short53
+.short  -156
+.short   362
+.short  -805
+.short  3876
+.short   951
+.short  -210
+.short32
+.short12
+.short   -11
 .short 3
 endconst
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] lavu/fixed_dsp: optimise R-V V fmul_reverse

2023-11-19 Thread Rémi Denis-Courmont

Gathers are (unsurprisingly) a notable exception to the rule that R-V V
gets faster with larger group multipliers. So roll the function to speed
it up.

Before:
vector_fmul_reverse_fixed_c:   2840.7
vector_fmul_reverse_fixed_rvv_i32: 2430.2

After:
vector_fmul_reverse_fixed_c:   2841.0
vector_fmul_reverse_fixed_rvv_i32:  962.2

It might be possible to further optimise the function by moving the
reverse-subtract out of the loop and adding ad-hoc tail handling.
---
 libavutil/riscv/fixed_dsp_rvv.S | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libavutil/riscv/fixed_dsp_rvv.S b/libavutil/riscv/fixed_dsp_rvv.S
index 2bece88685..46bb591352 100644
--- a/libavutil/riscv/fixed_dsp_rvv.S
+++ b/libavutil/riscv/fixed_dsp_rvv.S
@@ -127,16 +127,17 @@ endfunc
 
 func ff_vector_fmul_reverse_fixed_rvv, zve32x
 csrwi   vxrm, 0
-vsetvli t0, zero, e16, m4, ta, ma
+// e16/m4 and e32/m8 are possible but slow the gathers down.
+vsetvli t0, zero, e16, m1, ta, ma
 sh2add  a2, a3, a2
 vid.v   v0
 vadd.vi v0, v0, 1
 1:
-vsetvli t0, a3, e16, m4, ta, ma
+vsetvli t0, a3, e16, m1, ta, ma
 sllit1, t0, 2
 vrsub.vx v4, v0, t0 // v4[i] = [VL-1, VL-2... 1, 0]
 sub a2, a2, t1
-vsetvli zero, zero, e32, m8, ta, ma
+vsetvli zero, zero, e32, m2, ta, ma
 vle32.v v8, (a2)
 sub a3, a3, t0
 vle32.v v16, (a1)
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] lavu/float_dsp: optimise R-V V fmul_reverse & fmul_window

2023-11-19 Thread Rémi Denis-Courmont

Roll the loop to avoid slow gathers.

Before:
vector_fmul_reverse_c:   1561.7
vector_fmul_reverse_rvv_f32: 2410.2
vector_fmul_window_c:2068.2
vector_fmul_window_rvv_f32:  1879.5

After:
vector_fmul_reverse_c:   1561.7
vector_fmul_reverse_rvv_f32:  916.2
vector_fmul_window_c:2068.2
vector_fmul_window_rvv_f32:  1202.5
---
 libavutil/riscv/float_dsp_rvv.S | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/libavutil/riscv/float_dsp_rvv.S b/libavutil/riscv/float_dsp_rvv.S
index a2f9488249..ce5b6823d4 100644
--- a/libavutil/riscv/float_dsp_rvv.S
+++ b/libavutil/riscv/float_dsp_rvv.S
@@ -75,18 +75,19 @@ endfunc
 
 func ff_vector_fmul_window_rvv, zve32f
 // a0: dst, a1: src0, a2: src1, a3: window, a4: length
-vsetvlit0, zero, e16, m2, ta, ma
+// e16/m2 and e32/m4 are possible but slower due to gather.
+vsetvlit0, zero, e16, m1, ta, ma
 sh2add a2, a4, a2
 vid.v  v0
 sh3add t3, a4, a3
 vadd.viv0, v0, 1
 sh3add t0, a4, a0
 1:
-vsetvlit2, a4, e16, m2, ta, ma
+vsetvlit2, a4, e16, m1, ta, ma
 slli   t4, t2, 2
 vrsub.vx   v2, v0, t2
 subt3, t3, t4
-vsetvlizero, zero, e32, m4, ta, ma
+vsetvlizero, zero, e32, m2, ta, ma
 suba2, a2, t4
 vle32.vv8, (t3)
 subt0, t0, t4
@@ -133,6 +134,7 @@ endfunc
 // TODO factor vrsub, separate last iteration?
 // (a0) = (a1) * reverse(a2) [0..a3-1]
 func ff_vector_fmul_reverse_rvv, zve32f
+// e16/m4 and e32/m8 are possible but slower due to gather.
 vsetvli  t0, zero, e16, m4, ta, ma
 sh2add   a2, a3, a2
 vid.vv0
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] lavc/llvidencdsp: add R-V V diff_bytes

2023-11-19 Thread Rémi Denis-Courmont

diff_bytes_c:  163.0
diff_bytes_rvv_i32: 52.7
---
 libavcodec/lossless_videoencdsp.c   |  4 ++-
 libavcodec/lossless_videoencdsp.h   |  1 +
 libavcodec/riscv/Makefile   |  2 ++
 libavcodec/riscv/llvidencdsp_init.c | 39 +
 libavcodec/riscv/llvidencdsp_rvv.S  | 37 +++
 5 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/llvidencdsp_init.c
 create mode 100644 libavcodec/riscv/llvidencdsp_rvv.S

diff --git a/libavcodec/lossless_videoencdsp.c 
b/libavcodec/lossless_videoencdsp.c
index b4130ebc7b..e2dc99e201 100644
--- a/libavcodec/lossless_videoencdsp.c
+++ b/libavcodec/lossless_videoencdsp.c
@@ -94,7 +94,9 @@ av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c)
 c->sub_median_pred = sub_median_pred_c;
 c->sub_left_predict = sub_left_predict_c;
 
-#if ARCH_X86
+#if ARCH_RISCV
+ff_llvidencdsp_init_riscv(c);
+#elif ARCH_X86
 ff_llvidencdsp_init_x86(c);
 #endif
 }
diff --git a/libavcodec/lossless_videoencdsp.h 
b/libavcodec/lossless_videoencdsp.h
index f2c2878485..07fff584af 100644
--- a/libavcodec/lossless_videoencdsp.h
+++ b/libavcodec/lossless_videoencdsp.h
@@ -40,6 +40,7 @@ typedef struct LLVidEncDSPContext {
 } LLVidEncDSPContext;
 
 void ff_llvidencdsp_init(LLVidEncDSPContext *c);
+void ff_llvidencdsp_init_riscv(LLVidEncDSPContext *c);
 void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c);
 
 #endif /* AVCODEC_LOSSLESS_VIDEOENCDSP_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..2d0e6c19c8 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -30,6 +30,8 @@ OBJS-$(CONFIG_LLAUDDSP) += riscv/llauddsp_init.o
 RVV-OBJS-$(CONFIG_LLAUDDSP) += riscv/llauddsp_rvv.o
 OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_init.o
 RVV-OBJS-$(CONFIG_LLVIDDSP) += riscv/llviddsp_rvv.o
+OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_init.o
+RVV-OBJS-$(CONFIG_LLVIDENCDSP) += riscv/llvidencdsp_rvv.o
 OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_init.o
 RVV-OBJS-$(CONFIG_OPUS_DECODER) += riscv/opusdsp_rvv.o
 OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
diff --git a/libavcodec/riscv/llvidencdsp_init.c 
b/libavcodec/riscv/llvidencdsp_init.c
new file mode 100644
index 00..e35406dc41
--- /dev/null
+++ b/libavcodec/riscv/llvidencdsp_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/lossless_videoencdsp.h"
+
+void ff_llvidenc_diff_bytes_rvv(uint8_t *dst, const uint8_t *src1,
+const uint8_t *src2, intptr_t w);
+
+av_cold void ff_llvidencdsp_init_riscv(LLVidEncDSPContext *c)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_I32) {
+c->diff_bytes = ff_llvidenc_diff_bytes_rvv;
+}
+#endif
+}
diff --git a/libavcodec/riscv/llvidencdsp_rvv.S 
b/libavcodec/riscv/llvidencdsp_rvv.S
new file mode 100644
index 00..0342165127
--- /dev/null
+++ b/libavcodec/riscv/llvidencdsp_rvv.S
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_llvidenc_diff_bytes_rvv, zve32x
+1:
+vsetvli t0, a3, e8, m8, ta, ma
+vle8.v  v0, (a1)
+sub a3, a3, t0
+vle8.v  v8, (a2)
+add

[FFmpeg-devel] [PATCH] lavc/aacpsdsp: use LMUL=2 and amortise strides

2023-11-19 Thread Rémi Denis-Courmont

The input is laid out in 16 segments, of which 13 actually need to be
loaded. There are no really efficient ways to deal with this:
1) If we load 8 segments with unit stride, then narrow to 16 segments with
   right shifts, we can only fit one half-size vector per segment, or just 2
   elements per vector (EMUL=1/2). This ends up unsurprisingly about as fast
   as the C code.
2) The current approach is to load with strides. We keep that approach,
   but improve it using three 4-segmented loads instead of 12 single-segment
   loads. This divides the number of distinct loaded addresses by 4.
3) A potential third approach would be to avoid segmentation altogether
   and splat the scalar coefficient into vectors. Then we can use a
   unit-stride and maximum EMUL. But the downside then is that we have to
   multiply the 3 (of 16) unused segments with zero as part of the
   multiply-accumulate operations.

In addition, we also reuse vectors mid-loop so as to increase the EMUL
from 1 to 2, which also improves performance a little bit.

Overall the gains are quite small with the device under test, as it does
not deal with segmented loads very well. But at least the code is tidier,
and should enjoy bigger speed-ups on better hardware implementations.

Before:
ps_hybrid_analysis_c:   1819.2
ps_hybrid_analysis_rvv_f32: 1037.0 (before)
ps_hybrid_analysis_rvv_f32:  990.0 (after)
---
 libavcodec/riscv/aacpsdsp_rvv.S | 61 +++--
 1 file changed, 20 insertions(+), 41 deletions(-)

diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index 1dc426e01c..f46b35fe91 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -85,63 +85,42 @@ NOHWD   fsw fs\n, (4 * \n)(sp)
 flw fs4, (4 * ((6 * 2) + 0))(a1)
 flw fs5, (4 * ((6 * 2) + 1))(a1)
 
-adda2, a2, 6 * 2 * 4 // point to filter[i][6][0]
+add t2, a2, 6 * 2 * 4 // point to filter[i][6][0]
 li t4, 8 * 2 * 4 // filter byte stride
 slli   a3, a3, 3 // output byte stride
 1:
 .macro filter, vs0, vs1, fo0, fo1, fo2, fo3
 vfmacc.vf  v8, \fo0, \vs0
-vfmacc.vf  v9, \fo2, \vs0
+vfmacc.vf  v10, \fo2, \vs0
 vfnmsac.vf v8, \fo1, \vs1
-vfmacc.vf  v9, \fo3, \vs1
+vfmacc.vf  v10, \fo3, \vs1
 .endm
 
-vsetvlit0, a4, e32, m1, ta, ma
+vsetvlit0, a4, e32, m2, ta, ma
 /*
  * The filter (a2) has 16 segments, of which 13 need to be extracted.
  * R-V V supports only up to 8 segments, so unrolling is unavoidable.
  */
-addi   t1, a2, -48
-vlse32.v   v22, (a2), t4
-addi   t2, a2, -44
-vlse32.v   v16, (t1), t4
-addi   t1, a2, -40
-vfmul.vf   v8, v22, fs4
-vlse32.v   v24, (t2), t4
-addi   t2, a2, -36
-vfmul.vf   v9, v22, fs5
-vlse32.v   v17, (t1), t4
-addi   t1, a2, -32
-vlse32.v   v25, (t2), t4
-addi   t2, a2, -28
-filter v16, v24, ft0, ft1, ft2, ft3
-vlse32.v   v18, (t1), t4
-addi   t1, a2, -24
-vlse32.v   v26, (t2), t4
-addi   t2, a2, -20
-filter v17, v25, ft4, ft5, ft6, ft7
-vlse32.v   v19, (t1), t4
-addi   t1, a2, -16
-vlse32.v   v27, (t2), t4
-addi   t2, a2, -12
-filter v18, v26, ft8, ft9, ft10, ft11
-vlse32.v   v20, (t1), t4
-addi   t1, a2, -8
 vlse32.v   v28, (t2), t4
-addi   t2, a2, -4
-filter v19, v27, fa0, fa1, fa2, fa3
-vlse32.v   v21, (t1), t4
+addi   t1, a2, 16
+vfmul.vf   v8, v28, fs4
+vlsseg4e32.v v16, (a2), t4
+vfmul.vf   v10, v28, fs5
+filter v16, v18, ft0, ft1, ft2, ft3
+vlsseg4e32.v v24, (t1), t4
+filter v20, v22, ft4, ft5, ft6, ft7
+addi   t1, a2, 32
+filter v24, v26, ft8, ft9, ft10, ft11
+vlsseg4e32.v v16, (t1), t4
 suba4, a4, t0
-vlse32.v   v29, (t2), t4
+filter v28, v30, fa0, fa1, fa2, fa3
 slli   t1, t0, 3 + 1 + 2 // ctz(8 * 2 * 4)
-adda2, a2, t1
-filter v20, v28, fa4, fa5, fa6, fa7
-filter v21, v29, fs0, fs1, fs2, fs3
-
-addt2, a0, 4
-vsse32.v   v8, (a0), a3
+filter v16, v18, fa4, fa5, fa6, fa7
 mult0, t0, a3
-vsse32.v   v9, (t2), a3
+filter v20, v22, fs0, fs1, fs2, fs3
+adda2, a2, t1
+addt2, t2, t1
+vssseg2e32.v v8, (a0), a3
 adda0, a0, t0
 bnez   a4, 1b
 
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email

[FFmpeg-devel] [PATCH] riscv: set fast half-precision conversion

2023-11-16 Thread Rémi Denis-Courmont

This is only supported at compilation time. If Zfhmin is supported, then
conversions are fast, which is what the flag is used for. At this time,
run-time detection is not possible, as it is not supported by Linux. But even
if it were, the current FFmpeg approach seems unable to deal with it (same
problem as on x86, really).
---
 configure | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/configure b/configure
index d6e4a1e7df..6be849fc08 100755
--- a/configure
+++ b/configure
@@ -5413,6 +5413,9 @@ elif enabled riscv; then
 if test_cpp_condition stddef.h "__riscv_zbb"; then
 enable fast_clz
 fi
+if test_cpp_condition stddef.h "__riscv_zfhmin"; then
+enable fast_float16
+fi
 
 elif enabled sparc; then
 
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] checkasm/riscv: use t0 as alternative link register

2023-11-16 Thread Rémi Denis-Courmont

The unprivileged ISA specification says that either RA or T0 should be
used for this purpose. Other registers may confuse the return address
prediction stack.
---
 tests/checkasm/riscv/checkasm.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/riscv/checkasm.S b/tests/checkasm/riscv/checkasm.S
index 73ca85f344..b902ab1043 100644
--- a/tests/checkasm/riscv/checkasm.S
+++ b/tests/checkasm/riscv/checkasm.S
@@ -123,10 +123,10 @@ func checkasm_get_wrapper, v
 
 /* Call the tested function */
 la.tls.ie t0, checked_func
-add t0, tp, t0
-ld  t1, (t0)
-sd  zero, (t0)
-jalrt1
+add t1, tp, t0
+ld  t0, (t1)
+sd  zero, (t1)
+jalrt0
 
 /* Check special register values */
 la.tls.ie t0, saved_regs
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] checkasm: add helper to report a fatal signal

2023-11-16 Thread Rémi Denis-Courmont

---
 tests/checkasm/checkasm.c | 15 +++
 tests/checkasm/checkasm.h |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 708119e7c6..c67cf58922 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -23,10 +23,8 @@
 #include "config.h"
 #include "config_components.h"
 
-#if CONFIG_LINUX_PERF
-# ifndef _GNU_SOURCE
-#  define _GNU_SOURCE // for syscall (performance monitoring API)
-# endif
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE // for syscall (performance monitoring API), strsignal()
 #endif
 
 #include 
@@ -863,6 +861,15 @@ void checkasm_fail_func(const char *msg, ...)
 }
 }
 
+void checkasm_fail_signal(int signum)
+{
+#ifdef __GLIBC__
+checkasm_fail_func("fatal signal %d: %s", signum, strsignal(signum));
+#else
+checkasm_fail_func("fatal signal %d", signum);
+#endif
+}
+
 /* Get the benchmark context of the current function */
 CheckasmPerf *checkasm_get_perf_context(void)
 {
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index cfea868ff1..8a1df43ab6 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -102,6 +102,7 @@ struct CheckasmPerf;
 void *checkasm_check_func(void *func, const char *name, ...) 
av_printf_format(2, 3);
 int checkasm_bench_func(void);
 void checkasm_fail_func(const char *msg, ...) av_printf_format(1, 2);
+void checkasm_fail_signal(int signum);
 struct CheckasmPerf *checkasm_get_perf_context(void);
 void checkasm_report(const char *name, ...) av_printf_format(1, 2);
 
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] checkasm/riscv: report an error upon SIGILL

2023-11-16 Thread Rémi Denis-Courmont

Terminating the whole checkasm process is not very helpful. This will
report if an illegal instruction occurs while executing a tested
function. This is a common occurrence whilst developing RISC-V
assembler, due to the compatibility check between vector configuration
and instruction being done at run-time.
---
 tests/checkasm/checkasm.c   |  9 +
 tests/checkasm/checkasm.h   | 11 +--
 tests/checkasm/riscv/checkasm.S | 12 
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index c67cf58922..a15e801caf 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -27,6 +27,7 @@
 # define _GNU_SOURCE // for syscall (performance monitoring API), strsignal()
 #endif
 
+#include <signal.h>
 #include 
 #include 
 #include 
@@ -734,6 +735,14 @@ int main(int argc, char *argv[])
 if (have_vfp(av_get_cpu_flags()) || have_neon(av_get_cpu_flags()))
 checkasm_checked_call = checkasm_checked_call_vfp;
 #endif
+#if ARCH_RISCV
+struct sigaction act = {
+.sa_handler = checkasm_handle_signal,
+.sa_flags = 0,
+};
+
+sigaction(SIGILL, &act, NULL);
+#endif
 
 if (!tests[0].func || !cpus[0].flag) {
 fprintf(stderr, "checkasm: no tests to perform\n");
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8a1df43ab6..61734a8dbb 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -23,6 +23,7 @@
 #ifndef TESTS_CHECKASM_CHECKASM_H
 #define TESTS_CHECKASM_CHECKASM_H
 
+#include <setjmp.h>
 #include 
 #include "config.h"
 
@@ -211,14 +212,20 @@ void checkasm_checked_call(void *func, ...);
   checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 
0))
 #elif ARCH_RISCV
-void checkasm_set_function(void *);
+void checkasm_set_function(void *, sigjmp_buf);
 void *checkasm_get_wrapper(void);
+void checkasm_handle_signal(int signum);
 
 #if (__riscv_xlen == 64) && defined (__riscv_d)
 #define declare_new(ret, ...) \
+int checked_call_signum = 0; \
+sigjmp_buf checked_call_jb; \
 ret (*checked_call)(__VA_ARGS__) = checkasm_get_wrapper();
 #define call_new(...) \
-(checkasm_set_function(func_new), checked_call(__VA_ARGS__))
+(checkasm_set_function(func_new, checked_call_jb), \
+ (checked_call_signum = sigsetjmp(checked_call_jb, 1)) == 0 \
+? checked_call(__VA_ARGS__) \
+   : (checkasm_fail_signal(checked_call_signum), 0))
 #endif
 #else
 #define declare_new(ret, ...)
diff --git a/tests/checkasm/riscv/checkasm.S b/tests/checkasm/riscv/checkasm.S
index b902ab1043..30d3f3d8bb 100644
--- a/tests/checkasm/riscv/checkasm.S
+++ b/tests/checkasm/riscv/checkasm.S
@@ -41,6 +41,7 @@ endconst
 
 checked_func:
 .quad   0
+.quad   0
 
 saved_regs:
 /* Space to spill RA, SP, GP, TP, S0-S11 and FS0-FS11 */
@@ -52,6 +53,7 @@ func checkasm_set_function
 la.tls.ie t0, checked_func
 add t0, tp, t0
 sd  a0, (t0)
+sd  a1, 8(t0)
 ret
 endfunc
 
@@ -175,4 +177,14 @@ func checkasm_get_wrapper, v
 call checkasm_fail_func
 j   4b
 endfunc
+
+func checkasm_handle_signal
+mv  a1, a0
+la.tls.ie a0, checked_func
+add a0, tp, a0
+ld  a0, 8(a0)
+beqz a0, 8f
+tail siglongjmp
+8:  tail abort /* No jump buffer to go to */
+endfunc
 #endif
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] checkasm/riscv: use t0 as alternative link register

2023-11-16 Thread Rémi Denis-Courmont

Le torstaina 16. marraskuuta 2023, 18.04.51 EET Rémi Denis-Courmont a écrit :
> The unprivileged ISA specification says that either RA or T0 should be
> used for this purpose. Other registers may confuse the return address
> prediction stack.

Need more sleep. This is true for the link register (the destination operand), 
not the branch target (the source operand). Please ignore.

-- 
雷米‧德尼-库尔蒙
http://www.remlab.net/



___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] checkasm/flacdsp: fix ls/rs/ms tests

2023-11-13 Thread Rémi Denis-Courmont

decorrelate_ls, _rs and _ms are decorrelate[1], [2] and [3] respectively.
The code ended up testing indep ([0]) twice, never testing ms, and misnaming
the other two.
---
 tests/checkasm/flacdsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checkasm/flacdsp.c b/tests/checkasm/flacdsp.c
index ef93df8c81..51a0e0060b 100644
--- a/tests/checkasm/flacdsp.c
+++ b/tests/checkasm/flacdsp.c
@@ -78,7 +78,7 @@ void checkasm_check_flacdsp(void)
 for (i = 0; i < 2; i++) {
 ff_flacdsp_init(&h, fmts[i].fmt, 2);
 for (j = 0; j < 3; j++)
-if (check_func(h.decorrelate[j], "flac_decorrelate_%s_%d", 
names[j], fmts[i].bits))
+if (check_func(h.decorrelate[j + 1], "flac_decorrelate_%s_%d", 
names[j], fmts[i].bits))
 check_decorrelate(&ref_dst, ref_src, &new_dst, new_src, 2, 
fmts[i].bits);
 for (j = 2; j <= MAX_CHANNELS; j += 2) {
 ff_flacdsp_init(&h, fmts[i].fmt, j);
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] lavc/flacdsp: R-V V packed decorrelate_{l, r}s

2023-11-13 Thread Rémi Denis-Courmont

flac_decorrelate_ms_16_c:   457.2
flac_decorrelate_ms_16_rvv_i32: 203.0
flac_decorrelate_ms_32_c:   457.2
flac_decorrelate_ms_32_rvv_i32: 203.5
flac_decorrelate_rs_16_c:   456.2
flac_decorrelate_rs_16_rvv_i32: 207.0
flac_decorrelate_rs_32_c:   456.2
flac_decorrelate_rs_32_rvv_i32: 210.5
---
 libavcodec/flacdsp.c|   2 +
 libavcodec/flacdsp.h|   1 +
 libavcodec/riscv/Makefile   |   2 +
 libavcodec/riscv/flacdsp_init.c |  55 
 libavcodec/riscv/flacdsp_rvv.S  | 113 
 5 files changed, 173 insertions(+)
 create mode 100644 libavcodec/riscv/flacdsp_init.c
 create mode 100644 libavcodec/riscv/flacdsp_rvv.S

diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 42e231db53..71b4ac44aa 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -121,6 +121,8 @@ av_cold void ff_flacdsp_init(FLACDSPContext *c, enum 
AVSampleFormat fmt, int cha
 
 #if ARCH_ARM
 ff_flacdsp_init_arm(c, fmt, channels);
+#elif ARCH_RISCV
+ff_flacdsp_init_riscv(c, fmt, channels);
 #elif ARCH_X86
 ff_flacdsp_init_x86(c, fmt, channels);
 #endif
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 9f8ed38b66..15149c026e 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -38,6 +38,7 @@ typedef struct FLACDSPContext {
 
 void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels);
 void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int 
channels);
+void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt, int 
channels);
 void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int 
channels);
 
 #endif /* AVCODEC_FLACDSP_H */
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 57c1708dbb..d34dc77458 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -12,6 +12,8 @@ OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_init.o \
 RVV-OBJS-$(CONFIG_BSWAPDSP) += riscv/bswapdsp_rvv.o
 OBJS-$(CONFIG_EXR_DECODER) += riscv/exrdsp_init.o
 RVV-OBJS-$(CONFIG_EXR_DECODER) += riscv/exrdsp_rvv.o
+OBJS-$(CONFIG_FLAC_DECODER) += riscv/flacdsp_init.o
+RVV-OBJS-$(CONFIG_FLAC_DECODER) += riscv/flacdsp_rvv.o
 OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_init.o
 RVV-OBJS-$(CONFIG_FMTCONVERT) += riscv/fmtconvert_rvv.o
 OBJS-$(CONFIG_G722DSP) += riscv/g722dsp_init.o
diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
new file mode 100644
index 00..a3415d6d55
--- /dev/null
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2023 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/flacdsp.h"
+
+void ff_flac_decorrelate_ls_16_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
+void ff_flac_decorrelate_rs_16_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
+void ff_flac_decorrelate_ls_32_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
+void ff_flac_decorrelate_rs_32_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
+
+av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
+   int channels)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
+switch (fmt) {
+case AV_SAMPLE_FMT_S16:
+c->decorrelate[1] = ff_flac_decorrelate_ls_16_rvv;
+c->decorrelate[2] = ff_flac_decorrelate_rs_16_rvv;
+break;
+case AV_SAMPLE_FMT_S32:
+c->decorrelate[1] = ff_flac_decorrelate_ls_32_rvv;
+c->decorrelate[2] = ff_flac_decorrelate_rs_32_rvv;
+break;
+}
+}
+#endif
+}
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
new file mode 100644
index 00..c70ad8fcb0
--- /dev/null
+++ b/libavcodec/riscv/flacdsp_

[FFmpeg-devel] [PATCH 2/2] lavc/flacdsp: R-V V decorrelate_ms packed

2023-11-13 Thread Rémi Denis-Courmont

flac_decorrelate_ms_16_c:   585.5
flac_decorrelate_ms_16_rvv_i32: 263.0
flac_decorrelate_ms_32_c:   584.7
flac_decorrelate_ms_32_rvv_i32: 250.0
---
 libavcodec/riscv/flacdsp_init.c |  6 
 libavcodec/riscv/flacdsp_rvv.S  | 49 +
 2 files changed, 55 insertions(+)

diff --git a/libavcodec/riscv/flacdsp_init.c b/libavcodec/riscv/flacdsp_init.c
index a3415d6d55..0e7be25d98 100644
--- a/libavcodec/riscv/flacdsp_init.c
+++ b/libavcodec/riscv/flacdsp_init.c
@@ -28,10 +28,14 @@ void ff_flac_decorrelate_ls_16_rvv(uint8_t **out, int32_t 
**in,
int channels, int len, int shift);
 void ff_flac_decorrelate_rs_16_rvv(uint8_t **out, int32_t **in,
int channels, int len, int shift);
+void ff_flac_decorrelate_ms_16_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
 void ff_flac_decorrelate_ls_32_rvv(uint8_t **out, int32_t **in,
int channels, int len, int shift);
 void ff_flac_decorrelate_rs_32_rvv(uint8_t **out, int32_t **in,
int channels, int len, int shift);
+void ff_flac_decorrelate_ms_32_rvv(uint8_t **out, int32_t **in,
+   int channels, int len, int shift);
 
 av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
int channels)
@@ -44,10 +48,12 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum 
AVSampleFormat fmt,
 case AV_SAMPLE_FMT_S16:
 c->decorrelate[1] = ff_flac_decorrelate_ls_16_rvv;
 c->decorrelate[2] = ff_flac_decorrelate_rs_16_rvv;
+c->decorrelate[3] = ff_flac_decorrelate_ms_16_rvv;
 break;
 case AV_SAMPLE_FMT_S32:
 c->decorrelate[1] = ff_flac_decorrelate_ls_32_rvv;
 c->decorrelate[2] = ff_flac_decorrelate_rs_32_rvv;
+c->decorrelate[3] = ff_flac_decorrelate_ms_32_rvv;
 break;
 }
 }
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index c70ad8fcb0..616565ed7e 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -69,6 +69,32 @@ func ff_flac_decorrelate_rs_16_rvv, zve32x
 ret
 endfunc
 
+func ff_flac_decorrelate_ms_16_rvv, zve32x
+ld  a0,  (a0)
+ld  a2, 8(a1)
+ld  a1,  (a1)
+1:
+vsetvli t0, a3, e32, m8, ta, ma
+vle32.v v8, (a2)
+sub a3, a3, t0
+vle32.v v0, (a1)
+sh2add  a1, t0, a1
+vsra.vi v16, v8, 1
+sh2add  a2, t0, a2
+vsub.vv v24, v0, v16
+vadd.vv v16, v24, v8
+vsll.vx v8, v24, a4
+vsll.vx v0, v16, a4
+vsetvli zero, zero, e16, m4, ta, ma
+vncvt.x.x.w v0, v0
+vncvt.x.x.w v4, v8
+vsseg2e16.v v0, (a0)
+sh2add  a0, t0, a0
+bnez a3, 1b
+
+ret
+endfunc
+
 func ff_flac_decorrelate_ls_32_rvv, zve32x
 ld  a0,  (a0)
 ld  a2, 8(a1)
@@ -110,4 +136,27 @@ func ff_flac_decorrelate_rs_32_rvv, zve32x
 
 ret
 endfunc
+
+func ff_flac_decorrelate_ms_32_rvv, zve32x
+ld  a0,  (a0)
+ld  a2, 8(a1)
+ld  a1,  (a1)
+1:
+vsetvli t0, a3, e32, m4, ta, ma
+vle32.v v4, (a2)
+sub a3, a3, t0
+vle32.v v0, (a1)
+sh2add  a1, t0, a1
+vsra.vi v8, v4, 1
+sh2add  a2, t0, a2
+vsub.vv v12, v0, v8
+vadd.vv v8, v12, v4
+vsll.vx v4, v12, a4
+vsll.vx v0, v8, a4
+vsseg2e32.v v0, (a0)
+sh3add  a0, t0, a0
+bnez a3, 1b
+
+ret
+endfunc
 #endif
-- 
2.42.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

< 1 2 3 4 5 6 7 8 9 10 >

401 - 500 of 1026 matches

Mail list logo