Encode categorical variables: handling special characters JIRA: MADLIB-1238 JIRA: MADLIB-1243
This commit deals with special characters in column name and column values. Also adds install check test cases to cover these scenarios. Co-Authored-by: Jingyi Mei <j...@pivotal.io> Co-Authored-by: Arvind Sridhar <asrid...@pivotal.io> Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/d24cdfe1 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/d24cdfe1 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/d24cdfe1 Branch: refs/heads/test Commit: d24cdfe1dbdcfe8ba2379a70a52cafeeba994c0e Parents: 262e796 Author: Arvind Sridhar <asrid...@pivotal.io> Authored: Wed May 23 17:02:43 2018 -0700 Committer: Nikhil Kak <n...@pivotal.io> Committed: Wed Jun 20 12:04:00 2018 -0700 ---------------------------------------------------------------------- .../modules/utilities/encode_categorical.py_in | 8 ++--- .../utilities/test/encode_categorical.sql_in | 31 ++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/madlib/blob/d24cdfe1/src/ports/postgres/modules/utilities/encode_categorical.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in index 0d7eb91..cd08012 100644 --- a/src/ports/postgres/modules/utilities/encode_categorical.py_in +++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in @@ -297,7 +297,7 @@ class CategoricalEncoder(object): # all values collected in a list are to be treated as a single # categorical factor if v: - non_null_v_str = ','.join(["'%s'" % (i) for i in v if i is not None]) + non_null_v_str = ','.join(["$__madlib__$%s$__madlib__$" % (i) for i in v if i is not None]) if non_null_v_str: value_str = "IN ({0})".format(non_null_v_str) if None in v: @@ -314,7 +314,7 @@ class CategoricalEncoder(object): cast_str = '' else: # assume v is a string if not list/tuple and not None - value_str = "= '{v}'".format(v=str(v)) + value_str = "= $__madlib__${v}$__madlib__$".format(v=str(v)) v_type = str cast_str = '::TEXT' @@ -380,7 +380,7 @@ class CategoricalEncoder(object): else: encoded_col_name = '"{col_no_quotes}_{seq}"' seq = local_seq - insert_template = "('%s', {seq}, '{col}', '{value_str}'::TEXT)" % (encoded_col_name) + insert_template = "('%s', {seq}, '{col}', $__madlib__${value_str}$__madlib__$::TEXT)" % (encoded_col_name) insert_values = [insert_template. format(col=col, col_no_quotes=col_no_quotes, @@ -461,7 +461,7 @@ class CategoricalEncoder(object): # get value distribution for each column independently top_val_sql_list.append(""" SELECT - '{col}' as col_name, + $__madlib__${col}$__madlib__$ as col_name, array_agg(f order by c desc) as value, array_agg(c order by c desc) as freq FROM ( http://git-wip-us.apache.org/repos/asf/madlib/blob/d24cdfe1/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in index 31c27aa..7dc6169 100644 --- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in +++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in @@ -100,3 +100,34 @@ select encode_categorical_variables('abalone', 'abalone_out6', ); select * from abalone_out6; select * from abalone_out6_dictionary order by variable, index; + +-- Test special charaters and unicode +DROP TABLE IF EXISTS abalone_special_char; +CREATE TABLE abalone_special_char ( + id serial, + "se$$''x" character varying, + "len$$'%*()gth" double precision, + diameter double precision, + height double precision, + "ClaÐss" integer +); +COPY abalone_special_char ("se$$''x", "len$$'%*()gth", diameter, height, "ClaÐss") FROM stdin WITH DELIMITER '|' NULL as '@'; +F"F|0.475|0.37|0.125|2 +F'F|0.55|0.44|0.15|0 +F$$,'}][{F|0.565|0.44|0.155|2 +MÐM|0.44|0.365|0.125|0 +M@[}(:*;M|0.475|0.37|0.125|2 +M,M|0.47|0.355|0.100|1 +'F'F'|0.55|0.44|0.15|0 +\. + +select encode_categorical_variables('abalone_special_char', 'abalone_special_char_out1', '"se$$''''x", "len$$''%*()gth"'); +select * from abalone_special_char_out1; + +select encode_categorical_variables('abalone_special_char', + 'abalone_special_char_out2', + '"se$$''''x", "ClaÐss"', '', + 'id', '3', 'claÐss=1', + true, 'svec', true); +select * from abalone_special_char_out2; +select * from abalone_special_char_out2_dictionary order by variable, index;