Frank McQuillan created MADLIB-1295:
---------------------------------------
Summary: Encoding module is not handling bigint properly
Key: MADLIB-1295
URL: https://issues.apache.org/jira/browse/MADLIB-1295
Project: Apache MADlib
Issue Type: Bug
Components: Module: Utilities
Reporter: Frank McQuillan
Fix For: v1.16
From
http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
"all Boolean, integer and text columns are considered categorical columns and
will be encoded when ‘*’ is specified for this argument."
It works for int, however does not work for bigint:
1) int - OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings int
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 |
rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 1 | 0
7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 1 | 0 | 0 | 0
8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1 | 0 | 0 | 0 | 0 | 0
9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 1 | 0 | 0 | 0 | 0
11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
0 | 0 | 0 | 0 | 0 | 0 | 0
12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
(20 rows)
{code}
2) bigint - not OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings bigint
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M
----+-------+-------+-------
1 | 0 | 0 | 1
2 | 0 | 0 | 1
3 | 1 | 0 | 0
4 | 0 | 0 | 1
5 | 0 | 1 | 0
6 | 0 | 1 | 0
7 | 1 | 0 | 0
8 | 1 | 0 | 0
9 | 0 | 0 | 1
10 | 0 | 0 | 0
11 | 1 | 0 | 0
12 | 0 | 0 | 1
13 | 0 | 0 | 1
14 | 1 | 0 | 0
15 | 1 | 0 | 0
16 | 0 | 0 | 1
17 | 0 | 1 | 0
18 | 1 | 0 | 0
19 | 0 | 0 | 1
20 | 0 | 0 | 0
(20 rows)
{code}
so it is not encoding rings.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)