[
https://issues.apache.org/jira/browse/MADLIB-1295?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Frank McQuillan updated MADLIB-1295:
------------------------------------
Description:
From
http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
"all Boolean, integer and text columns are considered categorical columns and
will be encoded when ‘*’ is specified for this argument."
It works for int and smallint, however does not work for bigint:
1) int - OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings int
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 |
rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 1 | 0
7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 1 | 0 | 0 | 0
8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1 | 0 | 0 | 0 | 0 | 0
9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 1 | 0 | 0 | 0 | 0
11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
0 | 0 | 0 | 0 | 0 | 0 | 0
12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
(20 rows)
{code}
2) bigint - not OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings bigint
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M
----+-------+-------+-------
1 | 0 | 0 | 1
2 | 0 | 0 | 1
3 | 1 | 0 | 0
4 | 0 | 0 | 1
5 | 0 | 1 | 0
6 | 0 | 1 | 0
7 | 1 | 0 | 0
8 | 1 | 0 | 0
9 | 0 | 0 | 1
10 | 0 | 0 | 0
11 | 1 | 0 | 0
12 | 0 | 0 | 1
13 | 0 | 0 | 1
14 | 1 | 0 | 0
15 | 1 | 0 | 0
16 | 0 | 0 | 1
17 | 0 | 1 | 0
18 | 1 | 0 | 0
19 | 0 | 0 | 1
20 | 0 | 0 | 0
(20 rows)
{code}
so it is not encoding rings.
Also check other places in this module for similar issues with `bigint` .
was:
From
http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
"all Boolean, integer and text columns are considered categorical columns and
will be encoded when ‘*’ is specified for this argument."
It works for int and smallint, however does not work for bigint:
1) int - OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings int
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 |
rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 1 | 0
7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 1 | 0 | 0 | 0
8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1 | 0 | 0 | 0 | 0 | 0
9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 1 | 0 | 0 | 0 | 0
11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
0 | 0 | 0 | 0 | 0 | 0 | 0
12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0
19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 1 | 0 | 0
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1
(20 rows)
{code}
2) bigint - not OK
{code}
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings bigint
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(null, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(null, 0.45, 0.32, 0.1, 9);
{code}
{code}
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
{code}
{code}
id | sex_F | sex_I | sex_M
----+-------+-------+-------
1 | 0 | 0 | 1
2 | 0 | 0 | 1
3 | 1 | 0 | 0
4 | 0 | 0 | 1
5 | 0 | 1 | 0
6 | 0 | 1 | 0
7 | 1 | 0 | 0
8 | 1 | 0 | 0
9 | 0 | 0 | 1
10 | 0 | 0 | 0
11 | 1 | 0 | 0
12 | 0 | 0 | 1
13 | 0 | 0 | 1
14 | 1 | 0 | 0
15 | 1 | 0 | 0
16 | 0 | 0 | 1
17 | 0 | 1 | 0
18 | 1 | 0 | 0
19 | 0 | 0 | 1
20 | 0 | 0 | 0
(20 rows)
{code}
so it is not encoding rings.
> Encoding module is not handling bigint properly
> -----------------------------------------------
>
> Key: MADLIB-1295
> URL: https://issues.apache.org/jira/browse/MADLIB-1295
> Project: Apache MADlib
> Issue Type: Bug
> Components: Module: Utilities
> Reporter: Frank McQuillan
> Priority: Minor
> Fix For: v1.16
>
>
> From
> http://madlib.apache.org/docs/latest/group__grp__encode__categorical.html
> "all Boolean, integer and text columns are considered categorical columns and
> will be encoded when ‘*’ is specified for this argument."
> It works for int and smallint, however does not work for bigint:
> 1) int - OK
> {code}
> DROP TABLE IF EXISTS abalone;
> CREATE TABLE abalone (
> id serial,
> sex character varying,
> length double precision,
> diameter double precision,
> height double precision,
> rings int
> );
> INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
> ('M', 0.455, 0.365, 0.095, 15),
> ('M', 0.35, 0.265, 0.09, 7),
> ('F', 0.53, 0.42, 0.135, 9),
> ('M', 0.44, 0.365, 0.125, 10),
> ('I', 0.33, 0.255, 0.08, 7),
> ('I', 0.425, 0.3, 0.095, 8),
> ('F', 0.53, 0.415, 0.15, 20),
> ('F', 0.545, 0.425, 0.125, 16),
> ('M', 0.475, 0.37, 0.125, 9),
> (null, 0.55, 0.44, 0.15, 19),
> ('F', 0.525, 0.38, 0.14, 14),
> ('M', 0.43, 0.35, 0.11, 10),
> ('M', 0.49, 0.38, 0.135, 11),
> ('F', 0.535, 0.405, 0.145, 10),
> ('F', 0.47, 0.355, 0.1, 10),
> ('M', 0.5, 0.4, 0.13, 12),
> ('I', 0.355, 0.28, 0.085, 7),
> ('F', 0.44, 0.34, 0.1, 10),
> ('M', 0.365, 0.295, 0.08, 7),
> (null, 0.45, 0.32, 0.1, 9);
> {code}
> {code}
> DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
> SELECT madlib.encode_categorical_variables (
> 'abalone', -- Source table
> 'abalone_out', -- Output table
> '*', -- Categorical columns
> NULL, -- Categorical columns to exclude
> 'id' -- Index columns
> );
> SELECT * FROM abalone_out ORDER BY id;
> {code}
> {code}
> id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 |
> rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9
> ----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+---------
> 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
> 1 | 0 | 0 | 0 | 0 | 0 | 0
> 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 1 | 0 | 0
> 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 1
> 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 1 | 0 | 0
> 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 1 | 0
> 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 1 | 0 | 0 | 0
> 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
> 0 | 1 | 0 | 0 | 0 | 0 | 0
> 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 1
> 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 1 | 0 | 0 | 0 | 0
> 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 1 | 0 | 0
> 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 0
> 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 1 | 0 | 0
> 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
> 0 | 0 | 0 | 0 | 0 | 0 | 1
> (20 rows)
> {code}
> 2) bigint - not OK
> {code}
> DROP TABLE IF EXISTS abalone;
> CREATE TABLE abalone (
> id serial,
> sex character varying,
> length double precision,
> diameter double precision,
> height double precision,
> rings bigint
> );
> INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
> ('M', 0.455, 0.365, 0.095, 15),
> ('M', 0.35, 0.265, 0.09, 7),
> ('F', 0.53, 0.42, 0.135, 9),
> ('M', 0.44, 0.365, 0.125, 10),
> ('I', 0.33, 0.255, 0.08, 7),
> ('I', 0.425, 0.3, 0.095, 8),
> ('F', 0.53, 0.415, 0.15, 20),
> ('F', 0.545, 0.425, 0.125, 16),
> ('M', 0.475, 0.37, 0.125, 9),
> (null, 0.55, 0.44, 0.15, 19),
> ('F', 0.525, 0.38, 0.14, 14),
> ('M', 0.43, 0.35, 0.11, 10),
> ('M', 0.49, 0.38, 0.135, 11),
> ('F', 0.535, 0.405, 0.145, 10),
> ('F', 0.47, 0.355, 0.1, 10),
> ('M', 0.5, 0.4, 0.13, 12),
> ('I', 0.355, 0.28, 0.085, 7),
> ('F', 0.44, 0.34, 0.1, 10),
> ('M', 0.365, 0.295, 0.08, 7),
> (null, 0.45, 0.32, 0.1, 9);
> {code}
> {code}
> DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
> SELECT madlib.encode_categorical_variables (
> 'abalone', -- Source table
> 'abalone_out', -- Output table
> '*', -- Categorical columns
> NULL, -- Categorical columns to exclude
> 'id' -- Index columns
> );
> SELECT * FROM abalone_out ORDER BY id;
> {code}
> {code}
> id | sex_F | sex_I | sex_M
> ----+-------+-------+-------
> 1 | 0 | 0 | 1
> 2 | 0 | 0 | 1
> 3 | 1 | 0 | 0
> 4 | 0 | 0 | 1
> 5 | 0 | 1 | 0
> 6 | 0 | 1 | 0
> 7 | 1 | 0 | 0
> 8 | 1 | 0 | 0
> 9 | 0 | 0 | 1
> 10 | 0 | 0 | 0
> 11 | 1 | 0 | 0
> 12 | 0 | 0 | 1
> 13 | 0 | 0 | 1
> 14 | 1 | 0 | 0
> 15 | 1 | 0 | 0
> 16 | 0 | 0 | 1
> 17 | 0 | 1 | 0
> 18 | 1 | 0 | 0
> 19 | 0 | 0 | 1
> 20 | 0 | 0 | 0
> (20 rows)
> {code}
> so it is not encoding rings.
> Also check other places in this module for similar issues with `bigint` .
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)