Frank McQuillan created MADLIB-1412:
---------------------------------------
Summary: Summary function has dups for MFV for approximate results
Key: MADLIB-1412
URL: https://issues.apache.org/jira/browse/MADLIB-1412
Project: Apache MADlib
Issue Type: Improvement
Components: Module: Descriptive Statistics
Reporter: Frank McQuillan
Fix For: v1.17
Recently I was using
http://madlib.apache.org/docs/latest/group__grp__summary.html
and noticed some behavior that I had not seen before. E.g., on the
abalone dataset the most frequent values and counts seem to be duplicated.
Starting with this dataset:
```
SELECT * FROM abalone_encoded LIMIT 5;
id | length | diameter | height | whole_weight | shucked_weight |
viscera_weight | shell_weight | rings | age | mature | sex_f | sex_i | sex_m
----+--------+----------+--------+--------------+----------------+----------------+--------------+-------+------+--------+-------+-------+-------
3 | 0.53 | 0.42 | 0.135 | 0.677 | 0.2565 |
0.1415 | 0.21 | 9 | 10.5 | 1 | 1 | 0 | 0
11 | 0.525 | 0.38 | 0.14 | 0.6065 | 0.194 |
0.1475 | 0.21 | 14 | 15.5 | 1 | 1 | 0 | 0
19 | 0.365 | 0.295 | 0.08 | 0.2555 | 0.097 |
0.043 | 0.1 | 7 | 8.5 | 0 | 0 | 0 | 1
27 | 0.58 | 0.45 | 0.185 | 0.9955 | 0.3945 |
0.272 | 0.285 | 11 | 12.5 | 1 | 1 | 0 | 0
35 | 0.705 | 0.55 | 0.2 | 1.7095 | 0.633 |
0.4115 | 0.49 | 13 | 14.5 | 1 | 1 | 0 | 0
(5 rows)
```
(1)
approximate method
```
DROP TABLE IF EXISTS abalone_summary ;
SELECT
madlib.summary (
'abalone_encoded', -- source_table,
'abalone_summary', -- output_table,
NULL, -- target_cols,
NULL -- grouping_cols
)
SELECT * FROM abalone_summary LIMIT 15;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {3453,2468,2270,4117,4152,2274,3811,2364,2174,2737}
mfv_frequencies | {5,5,5,5,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.52399209959301
variance | 0.0144223076482969
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.625,0.55,0.575,0.575,0.58,0.58,0.6,0.62}
mfv_frequencies | {94,94,94,94,93,93,92,92,87,87}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488868
variance | 0.00984855103022602
confidence_interval | {0.404871645997761,0.410890862979975}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.45,0.475,0.475,0.4,0.4,0.5,0.5,0.47,0.47}
mfv_frequencies | {139,139,120,120,111,111,110,110,100,100}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329662
variance | 0.00174950266442675
confidence_interval | {0.138247926591563,0.140784872067762}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.15,0.14,0.14,0.155,0.155,0.175,0.175,0.16,0.16}
mfv_frequencies | {267,267,220,220,217,217,211,211,205,205}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.828742159444577
variance | 0.240481389201562
confidence_interval | {0.813870324055099,0.843613994834056}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,1.1155,0.97,0.4775,0.494,0.874,1.229,0.6855}
mfv_frequencies | {11,8,8,7,7,7,7,7,7,6}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628203
variance | 0.0492675507435237
confidence_interval | {0.352636105342963,0.366098871913442}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.2025,0.097,0.096,0.302,0.2945,0.25,0.261,0.358}
mfv_frequencies | {11,10,9,9,9,9,9,8,8,8}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599927
confidence_interval | {0.17726937948362,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.1905,0.207,0.1405,0.159,0.156}
mfv_frequencies | {15,14,13,13,13,12,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values |
{0.275,0.275,0.25,0.265,0.315,0.185,0.265,0.315,0.285,0.3}
mfv_frequencies | {43,43,42,40,40,40,40,40,37,37}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,9,10,10,8,8,11,11,7,7}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,1,0,0}
mfv_frequencies | {2770,2770,1407,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2870,2870,1307,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2835,2835,1342,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2649,2649,1528,1528}
```
Notice the dups, e.g., for the field `age`
```
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
```
(2)
exact method
For comparison if you do
```
DROP TABLE IF EXISTS abalone_summary_exact;
SELECT madlib.summary (
'abalone_encoded', -- source_table
'abalone_summary_exact', -- output_table
NULL, -- target_cols
NULL, -- grouping_cols
TRUE, -- get_distinct
TRUE, -- get_quartiles
NULL, -- quantile_array
10, -- how_many_mfv
FALSE, -- get_estimate
15 -- n_cols_per_run
);
```
you get
```
SELECT * FROM abalone_summary_exact;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {4117,4114,2914,3185,3348,3114,3283,3362,3377,3593}
mfv_frequencies | {5,5,4,4,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.523992099593009
variance | 0.0144223076482971
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.575,0.58,0.6,0.62,0.5,0.57,0.63,0.61}
mfv_frequencies | {94,94,93,92,87,87,81,79,78,75}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488869
variance | 0.00984855103022455
confidence_interval | {0.404871645997762,0.410890862979976}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.475,0.4,0.5,0.47,0.48,0.455,0.46,0.44,0.485}
mfv_frequencies | {139,120,111,110,100,91,90,89,87,83}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329661
variance | 0.00174950266442686
confidence_interval | {0.138247926591562,0.140784872067761}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.14,0.155,0.175,0.16,0.125,0.165,0.135,0.145,0.12}
mfv_frequencies | {267,220,217,211,205,202,193,189,182,169}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.82874215944458
variance | 0.240481389201557
confidence_interval | {0.813870324055102,0.843613994834058}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,0.44,0.872,1.0835,0.4775,0.4425,1.1155,0.97}
mfv_frequencies | {9,8,8,7,7,7,7,7,7,7}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628202
variance | 0.0492675507435239
confidence_interval | {0.352636105342962,0.366098871913441}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.0745,0.2945,0.2025,0.097,0.165,0.302,0.419,0.2}
mfv_frequencies | {11,10,10,9,9,9,9,9,9,9}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599928
confidence_interval | {0.177269379483621,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.0575,0.0265,0.096,0.15,0.1905}
mfv_frequencies | {15,14,13,13,13,13,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values | {0.275,0.25,0.185,0.265,0.315,0.3,0.17,0.285,0.175,0.22}
mfv_frequencies | {43,42,40,40,40,37,37,37,36,36}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,10,8,11,7,12,6,13,14,5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,11.5,9.5,12.5,8.5,13.5,7.5,14.5,15.5,6.5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,0}
mfv_frequencies | {2770,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2870,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2835,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2649,1528}
```
which seems OK to me after spot checking some of the counts.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)