pitrou commented on code in PR #45762:
URL: https://github.com/apache/arrow/pull/45762#discussion_r1993969842
##########
python/pyarrow/tests/test_compute.py:
##########
@@ -3838,3 +3838,22 @@ def test_pivot_wider():
with pytest.raises(ValueError, match="Encountered more than one non-null
value"):
result = pc.pivot_wider(["height", "width", "height"], [10, None, 11],
key_names=key_names)
+
+
[email protected]
Review Comment:
I don't think we need Pandas, we can just test against hardcoded results.
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -3693,6 +3693,11 @@ TEST_F(TestSkewKurtosis, Options) {
AssertSkewKurtosisInvalid(type, {"[]", "[]", "[]"}, options);
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0,
-1.5);
+ options.bias = false;
+ AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0,
NAN);
Review Comment:
Are we sure we want to return NAN, or rather null since the situation is
similar to `min_count`?
##########
cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h:
##########
@@ -84,14 +84,31 @@ struct Moments {
double Stddev(int ddof) const { return sqrt(Variance(ddof)); }
- double Skew() const {
+ double Skew(bool bias = true) const {
+ double result;
// This may return NaN for m2 == 0 and m3 == 0, which is expected
- return sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ // or if unbiased and not enough samples (count < 2).
+ if (bias) {
+ result = sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ } else {
+ result =
+ sqrt(count * (count - 1)) / (count - 2) * (m3 / count) / pow((m2 /
count), 1.5);
+ }
+ return result;
}
- double Kurtosis() const {
+ double Kurtosis(bool bias = true) const {
+ double result;
// This may return NaN for m2 == 0 and m4 == 0, which is expected
- return count * m4 / (m2 * m2) - 3;
+ // or if unbiased and not enough samples (count < 3).
+ if (bias) {
+ result = count * m4 / (m2 * m2) - 3;
+ } else {
+ result = 1.0 / (count - 2) / (count - 3) *
+ ((pow(count, 2) - 1.0) * (m4 / count) / pow((m2 / count), 2) -
Review Comment:
`count * count` is faster and more accurate than `pow(count, 2)`.
##########
cpp/src/arrow/compute/api_aggregate.h:
##########
@@ -117,13 +117,17 @@ class ARROW_EXPORT VarianceOptions : public
FunctionOptions {
/// \brief Control Skew and Kurtosis kernel behavior
class ARROW_EXPORT SkewOptions : public FunctionOptions {
public:
- explicit SkewOptions(bool skip_nulls = true, uint32_t min_count = 0);
+ explicit SkewOptions(bool skip_nulls = true, bool bias = true, uint32_t
min_count = 0);
static constexpr char const kTypeName[] = "SkewOptions";
static SkewOptions Defaults() { return SkewOptions{}; }
/// If true (the default), null values are ignored. Otherwise, if any value
is null,
/// emit null.
bool skip_nulls;
+ /// If true (the default), the calculated value is biased. If false, the
calculated
+ /// value includes a correction factor to reduce bias, making it more
accurate for
+ /// small sample sizes.
+ bool bias;
Review Comment:
Nit: I would probably call this `biased`
##########
cpp/src/arrow/compute/kernels/aggregate_test.cc:
##########
@@ -3693,6 +3693,11 @@ TEST_F(TestSkewKurtosis, Options) {
AssertSkewKurtosisInvalid(type, {"[]", "[]", "[]"}, options);
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0,
-1.5);
+ options.bias = false;
+ AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0,
NAN);
Review Comment:
Also test when there are not enough values for unbiased skew?
##########
cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h:
##########
@@ -84,14 +84,31 @@ struct Moments {
double Stddev(int ddof) const { return sqrt(Variance(ddof)); }
- double Skew() const {
+ double Skew(bool bias = true) const {
+ double result;
// This may return NaN for m2 == 0 and m3 == 0, which is expected
- return sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ // or if unbiased and not enough samples (count < 2).
+ if (bias) {
+ result = sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ } else {
+ result =
+ sqrt(count * (count - 1)) / (count - 2) * (m3 / count) / pow((m2 /
count), 1.5);
Review Comment:
`x**1.5` is `sqrt(x * x * x)`, which may be faster than calling `pow`.
##########
cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h:
##########
@@ -84,14 +84,31 @@ struct Moments {
double Stddev(int ddof) const { return sqrt(Variance(ddof)); }
- double Skew() const {
+ double Skew(bool bias = true) const {
+ double result;
// This may return NaN for m2 == 0 and m3 == 0, which is expected
- return sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ // or if unbiased and not enough samples (count < 2).
+ if (bias) {
+ result = sqrt(count) * m3 / sqrt(m2 * m2 * m2);
+ } else {
+ result =
+ sqrt(count * (count - 1)) / (count - 2) * (m3 / count) / pow((m2 /
count), 1.5);
+ }
+ return result;
}
- double Kurtosis() const {
+ double Kurtosis(bool bias = true) const {
+ double result;
// This may return NaN for m2 == 0 and m4 == 0, which is expected
- return count * m4 / (m2 * m2) - 3;
+ // or if unbiased and not enough samples (count < 3).
+ if (bias) {
+ result = count * m4 / (m2 * m2) - 3;
+ } else {
+ result = 1.0 / (count - 2) / (count - 3) *
Review Comment:
Let's reduce the number of divisions here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]