Github user iyerr3 commented on a diff in the pull request:
https://github.com/apache/incubator-madlib/pull/10#discussion_r52059035
--- Diff: src/ports/postgres/modules/svm/kernel_approximation.py_in ---
@@ -0,0 +1,481 @@
+from __future__ import division
+
+import plpy
+
+from utilities.utilities import unique_string
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import num_features
+
+from math import sqrt
+from math import pi
+
+
+class GaussianKernelBase(object):
+ """docstring for gaussianKernel"""
+ def __init__(self, gamma, n_components, random_state,
+ random_weights, random_offset, id_col, val_col,
+ orig_data, **kwargs):
+ self.kernel_func = 'gaussian'
+ self.gamma = gamma
+ self.n_components = n_components
+ # int32 seed used by boost::minstd_rand
+ self.random_state = random_state
+ # random operators
+ self.rd_weights = random_weights
+ self.rd_offset = random_offset
+ # val column in random operators
+ self.rd_val = val_col
+ # id column in random operators
+ self.rd_id = id_col
+ self.transformed_table = dict()
+ self.original_table = dict()
+ # indicate whether rd_weights and rd_offset is view or table
+ # store the original data table name if they are view
+ # None if they are table
+ self.orig_data = orig_data
+
+ def clear(self):
+ data_type = 'view' if self.orig_data else 'table'
+ if self.rd_weights:
+ plpy.execute("drop {data_type} if exists {data};".format(
+ data=self.rd_weights,
+ data_type=data_type))
+ if self.rd_offset:
+ plpy.execute("drop {data_type} if exists {data};".format(
+ data=self.rd_offset,
+ data_type=data_type))
+
+ def __del__(self):
+ self.clear()
+
+ def saveAs(self, name):
+ if self.orig_data:
+ plpy.warning("Gaussian Kernel Warning: no need to save."
+ "Original data table exists: {0}"
+ .format(self.orig_data))
+ return
+
+ run_sql = """
+ create table {name} as
+ select
+ {id} as id, {val} as val,
+ 'offsets' as desp
+ from {rd_offset}
+ union
+ select
+ {id} as id, {val} as val,
+ 'weights' as desp
+ from {rd_weights}
+ """.format(name=name,
+ id=self.rd_id,
+ val=self.rd_val,
+ rd_offset=self.rd_offset,
+ rd_weights=self.rd_weights)
+ plpy.execute(run_sql)
+
+ @classmethod
+ def parse_params(cls, kernel_params='', n_features=10):
+ params_default = {
+ 'in_memory': 1,
+ 'gamma': 1/n_features,
+ 'random_state': 1,
+ 'n_components': 2*n_features}
+ params_types = {
+ 'in_memory': int,
+ 'gamma': float,
+ 'random_state': int,
+ 'n_components': int}
+ return extract_keyvalue_params(kernel_params,
+ params_types,
+ params_default)
+
+ @classmethod
+ def create(cls, schema_madlib, n_features, kernel_params):
+ params = cls.parse_params(kernel_params, n_features)
+ in_memory = params.pop('in_memory', True)
+ # according to the 1gb limit on each entry of the table
+ nelems = params['n_components']*n_features
+ if in_memory and nelems <= 1e8:
+ return GaussianKernelInMemory(schema_madlib, **params)
+ else:
+ return GaussianKernel(schema_madlib, **params)
+
+ @classmethod
+ def loadFrom(cls, schema_madlib, data, kernel_params=''):
+ rd_weights = unique_string(desp='random_weights')
+ rd_offset = unique_string(desp='random_offsets')
+ rd_val = unique_string(desp='val')
+ rd_id = unique_string(desp='id')
+ plpy.execute("""
+ drop view if exists {rd_weights};
+ create temp view {rd_weights} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'weights';
+
+ drop view if exists {rd_offset};
+ create temp view {rd_offset} as
+ select id as {rd_id}, val as {rd_val} from {data}
+ where desp = 'offsets';
+ """.format(**locals()))
+ params = cls.parse_params(kernel_params)
+ in_memory = params.pop('in_memory', True)
--- End diff --
I would still recommend using get() rather than pop() here, and adding
**kwargs to the derived classes' __init__(). Any time the caller passes
**params, it's best to accept **kwargs in the definition. That way the call
won't fail if someone later adds another parameter in parse_params() and does
not update the derived classes.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---