[
https://issues.apache.org/jira/browse/MADLIB-1412?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Frank McQuillan updated MADLIB-1412:
------------------------------------
Description:
Recently I was using
http://madlib.apache.org/docs/latest/group__grp__summary.html
and noticed some behavior that I had not seen before. E.g., on the
abalone dataset the most frequent values and counts seem to be duplicated.
Starting with this dataset:
{code}
SELECT * FROM abalone_encoded LIMIT 5;
id | length | diameter | height | whole_weight | shucked_weight |
viscera_weight | shell_weight | rings | age | mature | sex_f | sex_i | sex_m
----+--------+----------+--------+--------------+----------------+----------------+--------------+-------+------+--------+-------+-------+-------
3 | 0.53 | 0.42 | 0.135 | 0.677 | 0.2565 |
0.1415 | 0.21 | 9 | 10.5 | 1 | 1 | 0 | 0
11 | 0.525 | 0.38 | 0.14 | 0.6065 | 0.194 |
0.1475 | 0.21 | 14 | 15.5 | 1 | 1 | 0 | 0
19 | 0.365 | 0.295 | 0.08 | 0.2555 | 0.097 |
0.043 | 0.1 | 7 | 8.5 | 0 | 0 | 0 | 1
27 | 0.58 | 0.45 | 0.185 | 0.9955 | 0.3945 |
0.272 | 0.285 | 11 | 12.5 | 1 | 1 | 0 | 0
35 | 0.705 | 0.55 | 0.2 | 1.7095 | 0.633 |
0.4115 | 0.49 | 13 | 14.5 | 1 | 1 | 0 | 0
(5 rows)
{code}
(1)
approximate method
{code}
DROP TABLE IF EXISTS abalone_summary ;
SELECT
madlib.summary (
'abalone_encoded', -- source_table,
'abalone_summary', -- output_table,
NULL, -- target_cols,
NULL -- grouping_cols
)
SELECT * FROM abalone_summary LIMIT 15;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {3453,2468,2270,4117,4152,2274,3811,2364,2174,2737}
mfv_frequencies | {5,5,5,5,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.52399209959301
variance | 0.0144223076482969
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.625,0.55,0.575,0.575,0.58,0.58,0.6,0.62}
mfv_frequencies | {94,94,94,94,93,93,92,92,87,87}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488868
variance | 0.00984855103022602
confidence_interval | {0.404871645997761,0.410890862979975}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.45,0.475,0.475,0.4,0.4,0.5,0.5,0.47,0.47}
mfv_frequencies | {139,139,120,120,111,111,110,110,100,100}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329662
variance | 0.00174950266442675
confidence_interval | {0.138247926591563,0.140784872067762}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.15,0.14,0.14,0.155,0.155,0.175,0.175,0.16,0.16}
mfv_frequencies | {267,267,220,220,217,217,211,211,205,205}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.828742159444577
variance | 0.240481389201562
confidence_interval | {0.813870324055099,0.843613994834056}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,1.1155,0.97,0.4775,0.494,0.874,1.229,0.6855}
mfv_frequencies | {11,8,8,7,7,7,7,7,7,6}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628203
variance | 0.0492675507435237
confidence_interval | {0.352636105342963,0.366098871913442}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.2025,0.097,0.096,0.302,0.2945,0.25,0.261,0.358}
mfv_frequencies | {11,10,9,9,9,9,9,8,8,8}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599927
confidence_interval | {0.17726937948362,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.1905,0.207,0.1405,0.159,0.156}
mfv_frequencies | {15,14,13,13,13,12,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values |
{0.275,0.275,0.25,0.265,0.315,0.185,0.265,0.315,0.285,0.3}
mfv_frequencies | {43,43,42,40,40,40,40,40,37,37}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,9,10,10,8,8,11,11,7,7}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,1,0,0}
mfv_frequencies | {2770,2770,1407,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2870,2870,1307,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2835,2835,1342,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2649,2649,1528,1528}
{code}
Notice the dups, e.g., for the field `age`
{code}
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
{code}
(2)
exact method
For comparison if you do
{code}
DROP TABLE IF EXISTS abalone_summary_exact;
SELECT madlib.summary (
'abalone_encoded', -- source_table
'abalone_summary_exact', -- output_table
NULL, -- target_cols
NULL, -- grouping_cols
TRUE, -- get_distinct
TRUE, -- get_quartiles
NULL, -- quantile_array
10, -- how_many_mfv
FALSE, -- get_estimate
15 -- n_cols_per_run
);
{code}
you get
{code}
SELECT * FROM abalone_summary_exact;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {4117,4114,2914,3185,3348,3114,3283,3362,3377,3593}
mfv_frequencies | {5,5,4,4,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.523992099593009
variance | 0.0144223076482971
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.575,0.58,0.6,0.62,0.5,0.57,0.63,0.61}
mfv_frequencies | {94,94,93,92,87,87,81,79,78,75}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488869
variance | 0.00984855103022455
confidence_interval | {0.404871645997762,0.410890862979976}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.475,0.4,0.5,0.47,0.48,0.455,0.46,0.44,0.485}
mfv_frequencies | {139,120,111,110,100,91,90,89,87,83}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329661
variance | 0.00174950266442686
confidence_interval | {0.138247926591562,0.140784872067761}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.14,0.155,0.175,0.16,0.125,0.165,0.135,0.145,0.12}
mfv_frequencies | {267,220,217,211,205,202,193,189,182,169}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.82874215944458
variance | 0.240481389201557
confidence_interval | {0.813870324055102,0.843613994834058}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,0.44,0.872,1.0835,0.4775,0.4425,1.1155,0.97}
mfv_frequencies | {9,8,8,7,7,7,7,7,7,7}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628202
variance | 0.0492675507435239
confidence_interval | {0.352636105342962,0.366098871913441}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.0745,0.2945,0.2025,0.097,0.165,0.302,0.419,0.2}
mfv_frequencies | {11,10,10,9,9,9,9,9,9,9}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599928
confidence_interval | {0.177269379483621,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.0575,0.0265,0.096,0.15,0.1905}
mfv_frequencies | {15,14,13,13,13,13,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values | {0.275,0.25,0.185,0.265,0.315,0.3,0.17,0.285,0.175,0.22}
mfv_frequencies | {43,42,40,40,40,37,37,37,36,36}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,10,8,11,7,12,6,13,14,5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,11.5,9.5,12.5,8.5,13.5,7.5,14.5,15.5,6.5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,0}
mfv_frequencies | {2770,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2870,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2835,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2649,1528}
{code}
which seems OK to me after spot checking some of the counts.
was:
Recently I was using
http://madlib.apache.org/docs/latest/group__grp__summary.html
and noticed some behavior that I had not seen before. E.g., on the
abalone dataset the most frequent values and counts seem to be duplicated.
Starting with this dataset:
```
SELECT * FROM abalone_encoded LIMIT 5;
id | length | diameter | height | whole_weight | shucked_weight |
viscera_weight | shell_weight | rings | age | mature | sex_f | sex_i | sex_m
----+--------+----------+--------+--------------+----------------+----------------+--------------+-------+------+--------+-------+-------+-------
3 | 0.53 | 0.42 | 0.135 | 0.677 | 0.2565 |
0.1415 | 0.21 | 9 | 10.5 | 1 | 1 | 0 | 0
11 | 0.525 | 0.38 | 0.14 | 0.6065 | 0.194 |
0.1475 | 0.21 | 14 | 15.5 | 1 | 1 | 0 | 0
19 | 0.365 | 0.295 | 0.08 | 0.2555 | 0.097 |
0.043 | 0.1 | 7 | 8.5 | 0 | 0 | 0 | 1
27 | 0.58 | 0.45 | 0.185 | 0.9955 | 0.3945 |
0.272 | 0.285 | 11 | 12.5 | 1 | 1 | 0 | 0
35 | 0.705 | 0.55 | 0.2 | 1.7095 | 0.633 |
0.4115 | 0.49 | 13 | 14.5 | 1 | 1 | 0 | 0
(5 rows)
```
(1)
approximate method
```
DROP TABLE IF EXISTS abalone_summary ;
SELECT
madlib.summary (
'abalone_encoded', -- source_table,
'abalone_summary', -- output_table,
NULL, -- target_cols,
NULL -- grouping_cols
)
SELECT * FROM abalone_summary LIMIT 15;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {3453,2468,2270,4117,4152,2274,3811,2364,2174,2737}
mfv_frequencies | {5,5,5,5,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.52399209959301
variance | 0.0144223076482969
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.625,0.55,0.575,0.575,0.58,0.58,0.6,0.62}
mfv_frequencies | {94,94,94,94,93,93,92,92,87,87}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488868
variance | 0.00984855103022602
confidence_interval | {0.404871645997761,0.410890862979975}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.45,0.475,0.475,0.4,0.4,0.5,0.5,0.47,0.47}
mfv_frequencies | {139,139,120,120,111,111,110,110,100,100}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329662
variance | 0.00174950266442675
confidence_interval | {0.138247926591563,0.140784872067762}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.15,0.14,0.14,0.155,0.155,0.175,0.175,0.16,0.16}
mfv_frequencies | {267,267,220,220,217,217,211,211,205,205}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.828742159444577
variance | 0.240481389201562
confidence_interval | {0.813870324055099,0.843613994834056}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,1.1155,0.97,0.4775,0.494,0.874,1.229,0.6855}
mfv_frequencies | {11,8,8,7,7,7,7,7,7,6}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628203
variance | 0.0492675507435237
confidence_interval | {0.352636105342963,0.366098871913442}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.2025,0.097,0.096,0.302,0.2945,0.25,0.261,0.358}
mfv_frequencies | {11,10,9,9,9,9,9,8,8,8}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599927
confidence_interval | {0.17726937948362,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.1905,0.207,0.1405,0.159,0.156}
mfv_frequencies | {15,14,13,13,13,12,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values |
{0.275,0.275,0.25,0.265,0.315,0.185,0.265,0.315,0.285,0.3}
mfv_frequencies | {43,43,42,40,40,40,40,40,37,37}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,9,10,10,8,8,11,11,7,7}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,1,0,0}
mfv_frequencies | {2770,2770,1407,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2870,2870,1307,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2835,2835,1342,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,0,1,1}
mfv_frequencies | {2649,2649,1528,1528}
```
Notice the dups, e.g., for the field `age`
```
most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
```
(2)
exact method
For comparison if you do
```
DROP TABLE IF EXISTS abalone_summary_exact;
SELECT madlib.summary (
'abalone_encoded', -- source_table
'abalone_summary_exact', -- output_table
NULL, -- target_cols
NULL, -- grouping_cols
TRUE, -- get_distinct
TRUE, -- get_quartiles
NULL, -- quantile_array
10, -- how_many_mfv
FALSE, -- get_estimate
15 -- n_cols_per_run
);
```
you get
```
SELECT * FROM abalone_summary_exact;
-[ RECORD 1
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | id
column_number | 1
data_type | int4
row_count | 4177
distinct_values | 4177
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 2089
variance | 1454292.16666667
confidence_interval | {2052.42791957426,2125.57208042574}
min | 1
max | 4177
first_quartile | 1045
median | 2089
third_quartile | 3133
most_frequent_values | {4117,4114,2914,3185,3348,3114,3283,3362,3377,3593}
mfv_frequencies | {5,5,4,4,4,4,4,4,4,4}
-[ RECORD 2
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | length
column_number | 2
data_type | float8
row_count | 4177
distinct_values | 134
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.523992099593009
variance | 0.0144223076482971
confidence_interval | {0.520350088942874,0.527634110243145}
min | 0.075
max | 0.815
first_quartile | 0.45
median | 0.545
third_quartile | 0.615
most_frequent_values | {0.55,0.625,0.575,0.58,0.6,0.62,0.5,0.57,0.63,0.61}
mfv_frequencies | {94,94,93,92,87,87,81,79,78,75}
-[ RECORD 3
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | diameter
column_number | 3
data_type | float8
row_count | 4177
distinct_values | 111
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.407881254488869
variance | 0.00984855103022455
confidence_interval | {0.404871645997762,0.410890862979976}
min | 0.055
max | 0.65
first_quartile | 0.35
median | 0.425
third_quartile | 0.48
most_frequent_values | {0.45,0.475,0.4,0.5,0.47,0.48,0.455,0.46,0.44,0.485}
mfv_frequencies | {139,120,111,110,100,91,90,89,87,83}
-[ RECORD 4
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | height
column_number | 4
data_type | float8
row_count | 4177
distinct_values | 51
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4175
negative_values | 0
zero_values | 2
mean | 0.139516399329661
variance | 0.00174950266442686
confidence_interval | {0.138247926591562,0.140784872067761}
min | 0
max | 1.13
first_quartile | 0.115
median | 0.14
third_quartile | 0.165
most_frequent_values | {0.15,0.14,0.155,0.175,0.16,0.125,0.165,0.135,0.145,0.12}
mfv_frequencies | {267,220,217,211,205,202,193,189,182,169}
-[ RECORD 5
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | whole_weight
column_number | 5
data_type | float8
row_count | 4177
distinct_values | 2429
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.82874215944458
variance | 0.240481389201557
confidence_interval | {0.813870324055102,0.843613994834058}
min | 0.002
max | 2.8255
first_quartile | 0.4415
median | 0.7995
third_quartile | 1.153
most_frequent_values |
{1.1345,0.2225,0.196,0.44,0.872,1.0835,0.4775,0.4425,1.1155,0.97}
mfv_frequencies | {9,8,8,7,7,7,7,7,7,7}
-[ RECORD 6
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shucked_weight
column_number | 6
data_type | float8
row_count | 4177
distinct_values | 1515
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.359367488628202
variance | 0.0492675507435239
confidence_interval | {0.352636105342962,0.366098871913441}
min | 0.001
max | 1.488
first_quartile | 0.186
median | 0.336
third_quartile | 0.502
most_frequent_values |
{0.175,0.2505,0.0745,0.2945,0.2025,0.097,0.165,0.302,0.419,0.2}
mfv_frequencies | {11,10,10,9,9,9,9,9,9,9}
-[ RECORD 7
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | viscera_weight
column_number | 7
data_type | float8
row_count | 4177
distinct_values | 880
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.180593607852526
variance | 0.0120152838599928
confidence_interval | {0.177269379483621,0.183917836221431}
min | 0.0005
max | 0.76
first_quartile | 0.0935
median | 0.171
third_quartile | 0.253
most_frequent_values |
{0.1715,0.196,0.061,0.037,0.2195,0.0575,0.0265,0.096,0.15,0.1905}
mfv_frequencies | {15,14,13,13,13,13,12,12,12,12}
-[ RECORD 8
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | shell_weight
column_number | 8
data_type | float8
row_count | 4177
distinct_values | 926
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 0.238830859468518
variance | 0.0193773832021586
confidence_interval | {0.234609314715373,0.243052404221663}
min | 0.0015
max | 1.005
first_quartile | 0.13
median | 0.234
third_quartile | 0.329
most_frequent_values | {0.275,0.25,0.185,0.265,0.315,0.3,0.17,0.285,0.175,0.22}
mfv_frequencies | {43,42,40,40,40,37,37,37,36,36}
-[ RECORD 9
]--------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | rings
column_number | 9
data_type | int4
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 9.93368446253292
variance | 10.3952659473471
confidence_interval | {9.83590635305212,10.0314625720137}
min | 1
max | 29
first_quartile | 8
median | 9
third_quartile | 11
most_frequent_values | {9,10,8,11,7,12,6,13,14,5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 10
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | age
column_number | 10
data_type | numeric
row_count | 4177
distinct_values | 28
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 4177
negative_values | 0
zero_values | 0
mean | 11.4336844625329
variance | 10.3952659473471
confidence_interval | {11.3359063530521,11.5314625720137}
min | 2.5
max | 30.5
first_quartile | 9.5
median | 10.5
third_quartile | 12.5
most_frequent_values | {10.5,11.5,9.5,12.5,8.5,13.5,7.5,14.5,15.5,6.5}
mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
-[ RECORD 11
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | mature
column_number | 11
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 2770
negative_values | 0
zero_values | 1407
mean | 0.663155374670816
variance | 0.223433815172854
confidence_interval | {0.648820355293342,0.67749039404829}
min | 0
max | 1
first_quartile | 0
median | 1
third_quartile | 1
most_frequent_values | {1,0}
mfv_frequencies | {2770,1407}
-[ RECORD 12
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_f
column_number | 12
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1307
negative_values | 0
zero_values | 2870
mean | 0.31290399808475
variance | 0.21504656956495
confidence_interval | {0.298840605732167,0.326967390437333}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2870,1307}
-[ RECORD 13
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_i
column_number | 13
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1342
negative_values | 0
zero_values | 2835
mean | 0.321283217620302
variance | 0.218112529203438
confidence_interval | {0.30711992784858,0.335446507392023}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2835,1342}
-[ RECORD 14
]-------+------------------------------------------------------------------
group_by |
group_by_value |
target_column | sex_m
column_number | 14
data_type | int4
row_count | 4177
distinct_values | 2
missing_values | 0
blank_values |
fraction_missing | 0
fraction_blank |
positive_values | 1528
negative_values | 0
zero_values | 2649
mean | 0.365812784294949
variance | 0.232049345210086
confidence_interval | {0.351204002328337,0.380421566261561}
min | 0
max | 1
first_quartile | 0
median | 0
third_quartile | 1
most_frequent_values | {0,1}
mfv_frequencies | {2649,1528}
```
which seems OK to me after spot checking some of the counts.
> Summary function has dups for MFV for approximate results
> ---------------------------------------------------------
>
> Key: MADLIB-1412
> URL: https://issues.apache.org/jira/browse/MADLIB-1412
> Project: Apache MADlib
> Issue Type: Improvement
> Components: Module: Descriptive Statistics
> Reporter: Frank McQuillan
> Priority: Major
> Fix For: v1.17
>
>
> Recently I was using
> http://madlib.apache.org/docs/latest/group__grp__summary.html
> and noticed some behavior that I had not seen before. E.g., on the
> abalone dataset the most frequent values and counts seem to be duplicated.
> Starting with this dataset:
> {code}
> SELECT * FROM abalone_encoded LIMIT 5;
> id | length | diameter | height | whole_weight | shucked_weight |
> viscera_weight | shell_weight | rings | age | mature | sex_f | sex_i | sex_m
> ----+--------+----------+--------+--------------+----------------+----------------+--------------+-------+------+--------+-------+-------+-------
> 3 | 0.53 | 0.42 | 0.135 | 0.677 | 0.2565 |
> 0.1415 | 0.21 | 9 | 10.5 | 1 | 1 | 0 | 0
> 11 | 0.525 | 0.38 | 0.14 | 0.6065 | 0.194 |
> 0.1475 | 0.21 | 14 | 15.5 | 1 | 1 | 0 | 0
> 19 | 0.365 | 0.295 | 0.08 | 0.2555 | 0.097 |
> 0.043 | 0.1 | 7 | 8.5 | 0 | 0 | 0 | 1
> 27 | 0.58 | 0.45 | 0.185 | 0.9955 | 0.3945 |
> 0.272 | 0.285 | 11 | 12.5 | 1 | 1 | 0 | 0
> 35 | 0.705 | 0.55 | 0.2 | 1.7095 | 0.633 |
> 0.4115 | 0.49 | 13 | 14.5 | 1 | 1 | 0 | 0
> (5 rows)
> {code}
> (1)
> approximate method
> {code}
> DROP TABLE IF EXISTS abalone_summary ;
> SELECT
> madlib.summary (
> 'abalone_encoded', -- source_table,
> 'abalone_summary', -- output_table,
> NULL, -- target_cols,
> NULL -- grouping_cols
> )
> SELECT * FROM abalone_summary LIMIT 15;
> -[ RECORD 1
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | id
> column_number | 1
> data_type | int4
> row_count | 4177
> distinct_values | 4177
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 2089
> variance | 1454292.16666667
> confidence_interval | {2052.42791957426,2125.57208042574}
> min | 1
> max | 4177
> first_quartile | 1045
> median | 2089
> third_quartile | 3133
> most_frequent_values | {3453,2468,2270,4117,4152,2274,3811,2364,2174,2737}
> mfv_frequencies | {5,5,5,5,4,4,4,4,4,4}
> -[ RECORD 2
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | length
> column_number | 2
> data_type | float8
> row_count | 4177
> distinct_values | 134
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.52399209959301
> variance | 0.0144223076482969
> confidence_interval | {0.520350088942874,0.527634110243145}
> min | 0.075
> max | 0.815
> first_quartile | 0.45
> median | 0.545
> third_quartile | 0.615
> most_frequent_values | {0.55,0.625,0.625,0.55,0.575,0.575,0.58,0.58,0.6,0.62}
> mfv_frequencies | {94,94,94,94,93,93,92,92,87,87}
> -[ RECORD 3
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | diameter
> column_number | 3
> data_type | float8
> row_count | 4177
> distinct_values | 111
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.407881254488868
> variance | 0.00984855103022602
> confidence_interval | {0.404871645997761,0.410890862979975}
> min | 0.055
> max | 0.65
> first_quartile | 0.35
> median | 0.425
> third_quartile | 0.48
> most_frequent_values | {0.45,0.45,0.475,0.475,0.4,0.4,0.5,0.5,0.47,0.47}
> mfv_frequencies | {139,139,120,120,111,111,110,110,100,100}
> -[ RECORD 4
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | height
> column_number | 4
> data_type | float8
> row_count | 4177
> distinct_values | 51
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4175
> negative_values | 0
> zero_values | 2
> mean | 0.139516399329662
> variance | 0.00174950266442675
> confidence_interval | {0.138247926591563,0.140784872067762}
> min | 0
> max | 1.13
> first_quartile | 0.115
> median | 0.14
> third_quartile | 0.165
> most_frequent_values | {0.15,0.15,0.14,0.14,0.155,0.155,0.175,0.175,0.16,0.16}
> mfv_frequencies | {267,267,220,220,217,217,211,211,205,205}
> -[ RECORD 5
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | whole_weight
> column_number | 5
> data_type | float8
> row_count | 4177
> distinct_values | 2429
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.828742159444577
> variance | 0.240481389201562
> confidence_interval | {0.813870324055099,0.843613994834056}
> min | 0.002
> max | 2.8255
> first_quartile | 0.4415
> median | 0.7995
> third_quartile | 1.153
> most_frequent_values |
> {1.1345,0.2225,0.196,1.1155,0.97,0.4775,0.494,0.874,1.229,0.6855}
> mfv_frequencies | {11,8,8,7,7,7,7,7,7,6}
> -[ RECORD 6
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | shucked_weight
> column_number | 6
> data_type | float8
> row_count | 4177
> distinct_values | 1515
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.359367488628203
> variance | 0.0492675507435237
> confidence_interval | {0.352636105342963,0.366098871913442}
> min | 0.001
> max | 1.488
> first_quartile | 0.186
> median | 0.336
> third_quartile | 0.502
> most_frequent_values |
> {0.175,0.2505,0.2025,0.097,0.096,0.302,0.2945,0.25,0.261,0.358}
> mfv_frequencies | {11,10,9,9,9,9,9,8,8,8}
> -[ RECORD 7
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | viscera_weight
> column_number | 7
> data_type | float8
> row_count | 4177
> distinct_values | 880
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.180593607852526
> variance | 0.0120152838599927
> confidence_interval | {0.17726937948362,0.183917836221431}
> min | 0.0005
> max | 0.76
> first_quartile | 0.0935
> median | 0.171
> third_quartile | 0.253
> most_frequent_values |
> {0.1715,0.196,0.061,0.037,0.2195,0.1905,0.207,0.1405,0.159,0.156}
> mfv_frequencies | {15,14,13,13,13,12,12,12,12,12}
> -[ RECORD 8
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | shell_weight
> column_number | 8
> data_type | float8
> row_count | 4177
> distinct_values | 926
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.238830859468518
> variance | 0.0193773832021586
> confidence_interval | {0.234609314715373,0.243052404221663}
> min | 0.0015
> max | 1.005
> first_quartile | 0.13
> median | 0.234
> third_quartile | 0.329
> most_frequent_values |
> {0.275,0.275,0.25,0.265,0.315,0.185,0.265,0.315,0.285,0.3}
> mfv_frequencies | {43,43,42,40,40,40,40,40,37,37}
> -[ RECORD 9
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | rings
> column_number | 9
> data_type | int4
> row_count | 4177
> distinct_values | 28
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 9.93368446253292
> variance | 10.3952659473471
> confidence_interval | {9.83590635305212,10.0314625720137}
> min | 1
> max | 29
> first_quartile | 8
> median | 9
> third_quartile | 11
> most_frequent_values | {9,9,10,10,8,8,11,11,7,7}
> mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
> -[ RECORD 10
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | age
> column_number | 10
> data_type | numeric
> row_count | 4177
> distinct_values | 28
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 11.4336844625329
> variance | 10.3952659473471
> confidence_interval | {11.3359063530521,11.5314625720137}
> min | 2.5
> max | 30.5
> first_quartile | 9.5
> median | 10.5
> third_quartile | 12.5
> most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
> mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
> -[ RECORD 11
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | mature
> column_number | 11
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 2770
> negative_values | 0
> zero_values | 1407
> mean | 0.663155374670816
> variance | 0.223433815172854
> confidence_interval | {0.648820355293342,0.67749039404829}
> min | 0
> max | 1
> first_quartile | 0
> median | 1
> third_quartile | 1
> most_frequent_values | {1,1,0,0}
> mfv_frequencies | {2770,2770,1407,1407}
> -[ RECORD 12
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_f
> column_number | 12
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1307
> negative_values | 0
> zero_values | 2870
> mean | 0.31290399808475
> variance | 0.21504656956495
> confidence_interval | {0.298840605732167,0.326967390437333}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,0,1,1}
> mfv_frequencies | {2870,2870,1307,1307}
> -[ RECORD 13
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_i
> column_number | 13
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1342
> negative_values | 0
> zero_values | 2835
> mean | 0.321283217620302
> variance | 0.218112529203438
> confidence_interval | {0.30711992784858,0.335446507392023}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,0,1,1}
> mfv_frequencies | {2835,2835,1342,1342}
> -[ RECORD 14
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_m
> column_number | 14
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1528
> negative_values | 0
> zero_values | 2649
> mean | 0.365812784294949
> variance | 0.232049345210086
> confidence_interval | {0.351204002328337,0.380421566261561}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,0,1,1}
> mfv_frequencies | {2649,2649,1528,1528}
> {code}
> Notice the dups, e.g., for the field `age`
> {code}
> most_frequent_values | {10.5,10.5,11.5,11.5,9.5,9.5,12.5,12.5,8.5,8.5}
> mfv_frequencies | {689,689,634,634,568,568,487,487,391,391}
> {code}
> (2)
> exact method
> For comparison if you do
> {code}
> DROP TABLE IF EXISTS abalone_summary_exact;
> SELECT madlib.summary (
> 'abalone_encoded', -- source_table
> 'abalone_summary_exact', -- output_table
> NULL, -- target_cols
> NULL, -- grouping_cols
> TRUE, -- get_distinct
> TRUE, -- get_quartiles
> NULL, -- quantile_array
> 10, -- how_many_mfv
> FALSE, -- get_estimate
> 15 -- n_cols_per_run
> );
> {code}
> you get
> {code}
> SELECT * FROM abalone_summary_exact;
> -[ RECORD 1
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | id
> column_number | 1
> data_type | int4
> row_count | 4177
> distinct_values | 4177
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 2089
> variance | 1454292.16666667
> confidence_interval | {2052.42791957426,2125.57208042574}
> min | 1
> max | 4177
> first_quartile | 1045
> median | 2089
> third_quartile | 3133
> most_frequent_values | {4117,4114,2914,3185,3348,3114,3283,3362,3377,3593}
> mfv_frequencies | {5,5,4,4,4,4,4,4,4,4}
> -[ RECORD 2
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | length
> column_number | 2
> data_type | float8
> row_count | 4177
> distinct_values | 134
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.523992099593009
> variance | 0.0144223076482971
> confidence_interval | {0.520350088942874,0.527634110243145}
> min | 0.075
> max | 0.815
> first_quartile | 0.45
> median | 0.545
> third_quartile | 0.615
> most_frequent_values | {0.55,0.625,0.575,0.58,0.6,0.62,0.5,0.57,0.63,0.61}
> mfv_frequencies | {94,94,93,92,87,87,81,79,78,75}
> -[ RECORD 3
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | diameter
> column_number | 3
> data_type | float8
> row_count | 4177
> distinct_values | 111
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.407881254488869
> variance | 0.00984855103022455
> confidence_interval | {0.404871645997762,0.410890862979976}
> min | 0.055
> max | 0.65
> first_quartile | 0.35
> median | 0.425
> third_quartile | 0.48
> most_frequent_values | {0.45,0.475,0.4,0.5,0.47,0.48,0.455,0.46,0.44,0.485}
> mfv_frequencies | {139,120,111,110,100,91,90,89,87,83}
> -[ RECORD 4
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | height
> column_number | 4
> data_type | float8
> row_count | 4177
> distinct_values | 51
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4175
> negative_values | 0
> zero_values | 2
> mean | 0.139516399329661
> variance | 0.00174950266442686
> confidence_interval | {0.138247926591562,0.140784872067761}
> min | 0
> max | 1.13
> first_quartile | 0.115
> median | 0.14
> third_quartile | 0.165
> most_frequent_values |
> {0.15,0.14,0.155,0.175,0.16,0.125,0.165,0.135,0.145,0.12}
> mfv_frequencies | {267,220,217,211,205,202,193,189,182,169}
> -[ RECORD 5
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | whole_weight
> column_number | 5
> data_type | float8
> row_count | 4177
> distinct_values | 2429
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.82874215944458
> variance | 0.240481389201557
> confidence_interval | {0.813870324055102,0.843613994834058}
> min | 0.002
> max | 2.8255
> first_quartile | 0.4415
> median | 0.7995
> third_quartile | 1.153
> most_frequent_values |
> {1.1345,0.2225,0.196,0.44,0.872,1.0835,0.4775,0.4425,1.1155,0.97}
> mfv_frequencies | {9,8,8,7,7,7,7,7,7,7}
> -[ RECORD 6
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | shucked_weight
> column_number | 6
> data_type | float8
> row_count | 4177
> distinct_values | 1515
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.359367488628202
> variance | 0.0492675507435239
> confidence_interval | {0.352636105342962,0.366098871913441}
> min | 0.001
> max | 1.488
> first_quartile | 0.186
> median | 0.336
> third_quartile | 0.502
> most_frequent_values |
> {0.175,0.2505,0.0745,0.2945,0.2025,0.097,0.165,0.302,0.419,0.2}
> mfv_frequencies | {11,10,10,9,9,9,9,9,9,9}
> -[ RECORD 7
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | viscera_weight
> column_number | 7
> data_type | float8
> row_count | 4177
> distinct_values | 880
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.180593607852526
> variance | 0.0120152838599928
> confidence_interval | {0.177269379483621,0.183917836221431}
> min | 0.0005
> max | 0.76
> first_quartile | 0.0935
> median | 0.171
> third_quartile | 0.253
> most_frequent_values |
> {0.1715,0.196,0.061,0.037,0.2195,0.0575,0.0265,0.096,0.15,0.1905}
> mfv_frequencies | {15,14,13,13,13,13,12,12,12,12}
> -[ RECORD 8
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | shell_weight
> column_number | 8
> data_type | float8
> row_count | 4177
> distinct_values | 926
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 0.238830859468518
> variance | 0.0193773832021586
> confidence_interval | {0.234609314715373,0.243052404221663}
> min | 0.0015
> max | 1.005
> first_quartile | 0.13
> median | 0.234
> third_quartile | 0.329
> most_frequent_values |
> {0.275,0.25,0.185,0.265,0.315,0.3,0.17,0.285,0.175,0.22}
> mfv_frequencies | {43,42,40,40,40,37,37,37,36,36}
> -[ RECORD 9
> ]--------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | rings
> column_number | 9
> data_type | int4
> row_count | 4177
> distinct_values | 28
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 9.93368446253292
> variance | 10.3952659473471
> confidence_interval | {9.83590635305212,10.0314625720137}
> min | 1
> max | 29
> first_quartile | 8
> median | 9
> third_quartile | 11
> most_frequent_values | {9,10,8,11,7,12,6,13,14,5}
> mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
> -[ RECORD 10
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | age
> column_number | 10
> data_type | numeric
> row_count | 4177
> distinct_values | 28
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 4177
> negative_values | 0
> zero_values | 0
> mean | 11.4336844625329
> variance | 10.3952659473471
> confidence_interval | {11.3359063530521,11.5314625720137}
> min | 2.5
> max | 30.5
> first_quartile | 9.5
> median | 10.5
> third_quartile | 12.5
> most_frequent_values | {10.5,11.5,9.5,12.5,8.5,13.5,7.5,14.5,15.5,6.5}
> mfv_frequencies | {689,634,568,487,391,267,259,203,126,115}
> -[ RECORD 11
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | mature
> column_number | 11
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 2770
> negative_values | 0
> zero_values | 1407
> mean | 0.663155374670816
> variance | 0.223433815172854
> confidence_interval | {0.648820355293342,0.67749039404829}
> min | 0
> max | 1
> first_quartile | 0
> median | 1
> third_quartile | 1
> most_frequent_values | {1,0}
> mfv_frequencies | {2770,1407}
> -[ RECORD 12
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_f
> column_number | 12
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1307
> negative_values | 0
> zero_values | 2870
> mean | 0.31290399808475
> variance | 0.21504656956495
> confidence_interval | {0.298840605732167,0.326967390437333}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,1}
> mfv_frequencies | {2870,1307}
> -[ RECORD 13
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_i
> column_number | 13
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1342
> negative_values | 0
> zero_values | 2835
> mean | 0.321283217620302
> variance | 0.218112529203438
> confidence_interval | {0.30711992784858,0.335446507392023}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,1}
> mfv_frequencies | {2835,1342}
> -[ RECORD 14
> ]-------+------------------------------------------------------------------
> group_by |
> group_by_value |
> target_column | sex_m
> column_number | 14
> data_type | int4
> row_count | 4177
> distinct_values | 2
> missing_values | 0
> blank_values |
> fraction_missing | 0
> fraction_blank |
> positive_values | 1528
> negative_values | 0
> zero_values | 2649
> mean | 0.365812784294949
> variance | 0.232049345210086
> confidence_interval | {0.351204002328337,0.380421566261561}
> min | 0
> max | 1
> first_quartile | 0
> median | 0
> third_quartile | 1
> most_frequent_values | {0,1}
> mfv_frequencies | {2649,1528}
> {code}
> which seems OK to me after spot checking some of the counts.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)