This is an automated email from the ASF dual-hosted git repository. khannaekta pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new 5a1717e MADLIB-1351 : Added stopping criteria on perplexity to LDA 5a1717e is described below commit 5a1717ee0f40084e904d07e13e8fd7ea1162358a Author: Himanshu Pandey <hpan...@pivotal.io> AuthorDate: Tue Aug 27 11:10:07 2019 -0700 MADLIB-1351 : Added stopping criteria on perplexity to LDA Prior to this commit, in LDA there are no stopping criteria. It runs for all the provided iterations. This commit calculates the perplexity on each iteration and when the difference between the last two perplexity values is less than the perplexity_tol, it stops the iteration. These are the two new parameters added to the function: ``` evaluate_every INTEGER, perplexity_tol DOUBLE PRECISION ``` Also, there is a change to the model output table. The following new columns are added: 1. perplexity(DOUBLE PRECISION[]): is an array of perplexity values as per the 'evaluate_every' parameter. 2. perplexity_iters(INTEGER[]): is an Array indicating the iterations for which perplexity is calculated --- src/ports/postgres/modules/lda/lda.py_in | 141 +++++++++++-- src/ports/postgres/modules/lda/lda.sql_in | 161 +++++++++++++- src/ports/postgres/modules/lda/test/lda.sql_in | 277 +++++++++++++++++++++++++ 3 files changed, 546 insertions(+), 33 deletions(-) diff --git a/src/ports/postgres/modules/lda/lda.py_in b/src/ports/postgres/modules/lda/lda.py_in index 303afee..29c9d85 100644 --- a/src/ports/postgres/modules/lda/lda.py_in +++ b/src/ports/postgres/modules/lda/lda.py_in @@ -18,6 +18,7 @@ from utilities.control import HashaggControl from utilities.utilities import __mad_version, _assert, warn from utilities.validate_args import output_tbl_valid from utilities.validate_args import input_tbl_valid +from utilities.utilities import py_list_to_sql_string # use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0 version_wrapper = __mad_version() @@ -33,7 +34,7 @@ class LDATrainer: def __init__(self, 
schema_madlib, data_table, model_table, output_data_table, voc_size, topic_num, - iter_num, alpha, beta): + iter_num, alpha, beta, evaluate_every, perplexity_tol ): self.schema_madlib = schema_madlib self.data_table = data_table self.voc_size = voc_size @@ -45,6 +46,13 @@ class LDATrainer: self.output_data_table = output_data_table self.work_table_0 = '__work_table_train_0__' self.work_table_1 = '__work_table_train_1__' + self.evaluate_every = evaluate_every + self.perplexity_tol = perplexity_tol + self.perplexity = [] + self.perplexity_diff = self.perplexity_tol + self.perplexity_iters = [] + self.tol_reached = False + self.num_iterations = 0 plpy.execute("DROP TABLE IF EXISTS " + self.work_table_0) plpy.execute(""" @@ -80,7 +88,10 @@ class LDATrainer: topic_num INT4, alpha FLOAT8, beta FLOAT8, - model INT8[] + model INT8[], + num_iterations INT, + perplexity DOUBLE PRECISION[], + perplexity_iters INTEGER[] ) m4_ifdef(`__POSTGRESQL__', `', `WITH (APPENDONLY=TRUE) @@ -135,24 +146,23 @@ class LDATrainer: # iteration to sycn up output table and model table self.update_model_table(work_table_final) - # Update output table - plpy.execute("TRUNCATE TABLE " + self.output_data_table) - plpy.execute(""" - INSERT INTO {output_data_table} - SELECT - docid, wordcount, words, counts, doc_topic[1:{topic_num}] topic_count, - doc_topic[{topic_num} + 1:array_upper(doc_topic,1)] topic_assignment - FROM - {work_table_final} - """.format(output_data_table=self.output_data_table, - topic_num=self.topic_num, - work_table_final=work_table_final)) - # etime = time.time() - # plpy.notice('\t\t\ttime elapsed: %.2f seconds' % (etime - stime)) + self.gen_output_data_table(work_table_final) + + # JIRA: MADLIB-1351 + # Calculate Perplexity after the final update of + # the Model and Output Table + if self.evaluate_every > 0 and not self.tol_reached: + self.perplexity.append( + get_perplexity(self.schema_madlib, + self.model_table, + self.output_data_table)) + # Need to update Model Table one 
more time to update + # last calculated value of perplexity to it + self.update_model_table(work_table_final) + def iteration(self, it): # stime = time.time() - work_table_in = self.work_table_0 work_table_out = self.work_table_1 if it % 2 == 0: @@ -191,7 +201,21 @@ class LDATrainer: # etime = time.time() # plpy.notice('\t\ttime elapsed: %.2f seconds' % (etime - stime)) + self.calculatePerplexity(it, work_table_in) + + def update_model_table(self, work_table_in): + + # JIRA: MADLIB-1351 + # Create a string based on the value of self.perplexity + perplexity_values = "" + perplexity_iterations = "" + if len(self.perplexity) >= 1: + perplexity_values = ", {0}".format(py_list_to_sql_string(self.perplexity)) + perplexity_iterations = ", {0}".format(py_list_to_sql_string(self.perplexity_iters)) + n_iterations=", {0}".format(self.num_iterations) + + plpy.execute('TRUNCATE TABLE ' + self.model_table) if version_wrapper.is_gp43(): with OptimizerControl(True): @@ -209,14 +233,18 @@ class LDATrainer: {voc_size}, {topic_num} ) AS model + {n_iterations}{perplexity_values} {perplexity_iterations} FROM {work_table_in} """.format(model_table=self.model_table, topic_num=self.topic_num, voc_size=self.voc_size, alpha=self.alpha, beta=self.beta, + perplexity_values=perplexity_values, + perplexity_iterations=perplexity_iterations, schema_madlib=self.schema_madlib, - work_table_in=work_table_in)) + work_table_in=work_table_in, + n_iterations=n_iterations)) else: # work around insertion memory error (MPP-25561) # by copying the model to Python temporarily @@ -239,31 +267,90 @@ class LDATrainer: INSERT INTO {model_table} SELECT {voc_size}, {topic_num}, {alpha}, {beta}, $1 + {n_iterations}{perplexity_values} {perplexity_iterations} """.format(model_table=self.model_table, topic_num=self.topic_num, voc_size=self.voc_size, alpha=self.alpha, beta=self.beta, - schema_madlib=self.schema_madlib), + perplexity_values=perplexity_values, + perplexity_iterations=perplexity_iterations, + 
schema_madlib=self.schema_madlib, + n_iterations=n_iterations), ['bigint[]']) plpy.execute(plan, [model]) def run(self): # stime = time.time() # plpy.notice('start training process ...') - self.init_random() # sstime = time.time() for it in range(1, self.iter_num + 1): + # JIRA: MADLIB-1351 + # If the Perplexity_diff is less than the perplexity_tol, + # Stop the iteration + if self.perplexity_diff < self.perplexity_tol: + self.tol_reached = True + # When toll is reached before the number of iterations, + # Reduce the num_iterations by 1 since perplexity_iters + # Runs one iteration behind in this case. + self.num_iterations-=1; + break + + self.iteration(it) + self.num_iterations+=1; # eetime = time.time() # plpy.notice('\t\titeration done, time elapsed: %.2f seconds' % (eetime - sstime)) + + # JIRA: MADLIB-1351 + # Add the last iteration value to the array + if self.evaluate_every > 0 and not self.tol_reached: + self.perplexity_iters.append(self.iter_num) self.gen_final_data_tables() # etime = time.time() # plpy.notice('finished, time elapsed: %.2f seconds' % (etime - stime)) + # Update output table + def gen_output_data_table(self, work_table_final): + plpy.execute("TRUNCATE TABLE " + self.output_data_table) + plpy.execute(""" + INSERT INTO {output_data_table} + SELECT + docid, wordcount, words, counts, doc_topic[1:{topic_num}] topic_count, + doc_topic[{topic_num} + 1:array_upper(doc_topic,1)] topic_assignment + FROM + {work_table_final} + """.format(output_data_table=self.output_data_table, + topic_num=self.topic_num, + work_table_final=work_table_final)) + # etime = time.time() + # plpy.notice('\t\t\ttime elapsed: %.2f seconds' % (etime - stime)) + + + def calculatePerplexity(self,it, work_table_in): + # JIRA: MADLIB-1351 + # Calculate Perplexity for evaluate_every Iteration + # Skip the calculation at the first iteration + # For each iteration: + # Model table is updated (for the first iteration, it is the random model. 
For iteration >1 , the model that is # updated is learnt in the previous iteration) + # __lda_count_topic_agg is called then lda_gibbs_sample is called which learns and updates the model(the updated # model is not passed to python. The learnt model is updated in the next iteration) + # Because of this workflow we can safely ignore the first perplexity value. + + + if it > self.evaluate_every and self.evaluate_every > 0 and ( + it - 1) % self.evaluate_every == 0: + self.gen_output_data_table(work_table_in) + perplexity = get_perplexity(self.schema_madlib, + self.model_table, + self.output_data_table) + if len(self.perplexity) > 0: + self.perplexity_diff = abs(self.perplexity[-1] - perplexity) + self.perplexity_iters.append(it - 1) + self.perplexity.append(perplexity) + # ------------------------------------------------------------------------------ @@ -416,7 +503,7 @@ class LDAPredictor: def lda_train(schema_madlib, train_table, model_table, output_data_table, voc_size, - topic_num, iter_num, alpha, beta): + topic_num, iter_num, alpha, beta, evaluate_every, perplexity_tol): """ @brief This function provides the entry for the LDA training process. 
@param schema_madlib MDALib schema @@ -446,6 +533,16 @@ def lda_train(schema_madlib, train_table, model_table, output_data_table, voc_si _assert(beta is not None and beta > 0, 'invalid argument: positive real expected for beta') + # Setting the default values for perplexity_tol and evaluate_every + if perplexity_tol is None: + perplexity_tol = 0.1 + if evaluate_every is None: + evaluate_every = 0 + _assert(evaluate_every <= iter_num, + 'invalid argument: evaluate_every should not be greater than iter_num') + _assert(perplexity_tol is not None and perplexity_tol >= 0, + 'invalid argument: perplexity_tol should not be less than 0') + output_tbl_valid(model_table, 'LDA') output_tbl_valid(output_data_table, 'LDA') @@ -460,7 +557,7 @@ def lda_train(schema_madlib, train_table, model_table, output_data_table, voc_si convt_table = _convert_data_table(schema_madlib, train_table) lt = LDATrainer(schema_madlib, convt_table, model_table, output_data_table, voc_size, topic_num, - iter_num, alpha, beta) + iter_num, alpha, beta,evaluate_every, perplexity_tol) lt.run() diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in index 16dc3a8..36e8106 100644 --- a/src/ports/postgres/modules/lda/lda.sql_in +++ b/src/ports/postgres/modules/lda/lda.sql_in @@ -62,12 +62,12 @@ The following generative process then defines a distribution over a corpus of documents: - Sample for each topic \f$ i \f$, a per-topic word -distribution \f$ \phi_i \f$ from the Dirichlet(\f$\beta\f$) prior. +distribution \f$ \phi_i \f$ from the Dirichlet (\f$\beta\f$) prior. - For each document: - Sample a document length N from a suitable distribution, say, Poisson. - Sample a topic mixture \f$ \theta \f$ for the document from the -Dirichlet(\f$\alpha\f$) distribution. +Dirichlet (\f$\alpha\f$) distribution. - For each of the N words: - Sample a topic \f$ z_n \f$ from the multinomial topic distribution \f$ \theta \f$. 
@@ -99,7 +99,9 @@ lda_train( data_table, topic_num, iter_num, alpha, - beta + beta, + evaluate_every, + perplexity_tol ) </pre> \b Arguments @@ -150,6 +152,28 @@ lda_train( data_table, <th>model</th> <td>BIGINT[]. The encoded model description (not human readable).</td> </tr> + <tr> + <th>num_iterations</th> + <td>INTEGER. Number of iterations that training ran for, which may + be less than the maximum value specified in the parameter 'iter_num' + if the perplexity tolerance was reached.</td> + </tr> + <tr> + <th>perplexity</th> + <td>DOUBLE PRECISION[]. Array of perplexity values as per the 'evaluate_every' parameter. + For example, if 'evaluate_every=5' this would be an array of perplexity values for + every 5th iteration, plus the last iteration.</td> + </tr> + <tr> + <th>perplexity_iters</th> + <td>INTEGER[]. Array indicating the iterations for which perplexity is calculated, as derived + from the parameters 'iter_num' and 'evaluate_every'. For example, if 'iter_num=5' and 'evaluate_every=2', + then 'perplexity_iters' value would be {2,4,5} indicating that perplexity is computed at + iterations 2, 4 and 5 (at the end), unless of course it terminated earlier due + to 'perplexity_tol'. If 'iter_num=5' and 'evaluate_every=1', then 'perplexity_iters' value + would be {1,2,3,4,5} indicating that perplexity is computed at every iteration, + again assuming it ran the full number of iterations.</td> + </tr> </table> </dd> <dt>output_data_table</dt> @@ -204,14 +228,24 @@ lda_train( data_table, <dt>topic_num</dt> <dd>INTEGER. Desired number of topics.</dd> <dt>iter_num</dt> - <dd>INTEGER. Desired number of iterations.</dd> + <dd>INTEGER. Maximum number of iterations. If a 'perplexity_tol' is set, + LDA may train for less than the maximum number of iterations if the tolerance is reached.</dd> <dt>alpha</dt> <dd>DOUBLE PRECISION. 
Dirichlet prior for the per-document topic multinomial (e.g., 50/topic_num is a reasonable value to start with - as per Griffiths and Steyvers [2] ).</dd> + as per Griffiths and Steyvers [2]).</dd> <dt>beta</dt> <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic word multinomial (e.g., 0.01 is a reasonable value to start with).</dd> + <dt>evaluate_every (optional)</dt> + <dd>INTEGER, default: 0. How often to evaluate perplexity. Set it to 0 or a negative number + to not evaluate perplexity in training at all. Evaluating perplexity can help you check + convergence during the training process, but it will also increase total training time. + For example, evaluating perplexity in every iteration might increase training time + up to two-fold.</dd> + <dt>perplexity_tol (optional)</dt> + <dd>DOUBLE PRECISION, default: 0.1. Perplexity tolerance to stop iteration. + Only used when the parameter 'evaluate_every' is greater than 0.</dd> </dl> @anchor predict @@ -246,11 +280,11 @@ lda_predict( data_table, @anchor perplexity @par Perplexity Perplexity describes how well the model fits the data by -computing word likelihoods averaged over the test documents. +computing word likelihoods averaged over the documents. This function returns a single perplexity value. <pre class="syntax"> lda_get_perplexity( model_table, - output_predict_table + output_data_table ); </pre> \b Arguments @@ -258,9 +292,10 @@ lda_get_perplexity( model_table, <dt>model_table</dt> <dd>TEXT. The model table generated by the training process. </dd> -<dt>output_predict_table</dt> - <dd>TEXT. The prediction output table generated by the - predict function above. +<dt>output_data_table</dt> + <dd>TEXT. Output table generated by the + training or predict functions, containing the topic assignments + by word. 
</dd> </dl> @@ -609,6 +644,20 @@ counts | {1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,2,11,1,1,2,1,1,3,1,1,1,1,1,1,1 topic_count | {5,5,26,5,8} topic_assignment | {4,4,4,0,2,0,0,2,4,4,2,2,2,1,2,4,1,0,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,4,3,3,3,2,3,2,3,2,1,4,2,2,1,0} </pre> +Review summary table: +<pre class="example"> +SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, perplexity_iters from lda_model; +</pre> +<pre class="result"> +-[ RECORD 1 ]----+----- +voc_size | 103 +topic_num | 5 +alpha | 5 +beta | 0.01 +num_iterations | 10 +perplexity | +perplexity_iters | +</pre> -# Review learned model using helper functions. First, we get topic description by top-k words. These are @@ -897,6 +946,48 @@ SELECT madlib.lda_get_perplexity( 'lda_model', -- LDA model from training (1 row) </pre> +-# Perplexity by iteration. Now let's look at how perplexity +changes from one iteration to the next: +<pre class="example"> +DROP TABLE IF EXISTS lda_model_perp, lda_output_data_perp; +SELECT madlib.lda_train( 'documents_tf', -- documents table in the form of term frequency + 'lda_model_perp', -- model table created by LDA training (not human readable) + 'lda_output_data_perp', -- readable output data table + 103, -- vocabulary size + 5, -- number of topics + 30, -- number of iterations + 5, -- Dirichlet prior for the per-doc topic multinomial (alpha) + 0.01, -- Dirichlet prior for the per-topic word multinomial (beta) + 2, -- Evaluate perplexity every n iterations + 0.3 -- Tolerance to stop iteration + ); +SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, perplexity_iters from lda_model_perp; +</pre> +<pre class="result"> +-[ RECORD 1 ]----+---------------------------------------------------------------------------------------------------- +voc_size | 103 +topic_num | 5 +alpha | 5 +beta | 0.01 +num_iterations | 14 +perplexity | {70.0297335165,65.6497887327,70.2040806534,68.2594871716,70.3816093812,67.9193935299,67.6325562682} +perplexity_iters | {2,4,6,8,10,12,14} 
+</pre> +Iterating stops at 14 since the tolerance is reached. There are 7 +perplexity values because we computed it only every 2nd iteration to save time. +As expected, the perplexity +on the training data is the same as the final iteration value: +<pre class="example"> +SELECT madlib.lda_get_perplexity( 'lda_model_perp', + 'lda_output_data_perp' + ); +</pre> +<pre class="result"> + lda_get_perplexity +--------------------+ + 67.632556268157 +</pre> + @anchor literature @literature @@ -967,7 +1058,55 @@ RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$ PythonFunctionBodyOnly(`lda', `lda') with AOControl(False): lda.lda_train(schema_madlib, data_table, model_table, output_data_table, - voc_size, topic_num, iter_num, alpha, beta) + voc_size, topic_num, iter_num, alpha, beta, None, None) + return [[model_table, 'model table'], + [output_data_table, 'output data table']] +$$ LANGUAGE plpythonu +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION +MADLIB_SCHEMA.lda_train +( + data_table TEXT, + model_table TEXT, + output_data_table TEXT, + voc_size INT4, + topic_num INT4, + iter_num INT4, + alpha FLOAT8, + beta FLOAT8, + evaluate_every INT4, + perplexity_tol FLOAT8 +) +RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$ + PythonFunctionBodyOnly(`lda', `lda') + with AOControl(False): + lda.lda_train(schema_madlib, data_table, model_table, output_data_table, + voc_size, topic_num, iter_num, alpha, beta,evaluate_every , perplexity_tol) + return [[model_table, 'model table'], + [output_data_table, 'output data table']] +$$ LANGUAGE plpythonu +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION +MADLIB_SCHEMA.lda_train +( + data_table TEXT, + model_table TEXT, + output_data_table TEXT, + voc_size INT4, + topic_num INT4, + iter_num INT4, + alpha FLOAT8, + beta FLOAT8, + evaluate_every INT4 +) +RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$ + PythonFunctionBodyOnly(`lda', `lda') + with
AOControl(False): + lda.lda_train(schema_madlib, data_table, model_table, output_data_table, + voc_size, topic_num, iter_num, alpha, beta,evaluate_every , None) return [[model_table, 'model table'], [output_data_table, 'output data table']] $$ LANGUAGE plpythonu diff --git a/src/ports/postgres/modules/lda/test/lda.sql_in b/src/ports/postgres/modules/lda/test/lda.sql_in index c230240..96d5f5f 100644 --- a/src/ports/postgres/modules/lda/test/lda.sql_in +++ b/src/ports/postgres/modules/lda/test/lda.sql_in @@ -288,3 +288,280 @@ CREATE OR REPLACE FUNCTION validate_lda_output() RETURNS integer AS $$ $$ LANGUAGE plpgsql; select validate_lda_output(); + + +---------- TEST CASES FOR PERPLEXITY ---------- + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 2, -- iter_num + 10, -- alpha + 0.01, -- beta + 2, -- evaluate_every + .2); -- perplexity_tol + +SELECT assert(perplexity_iters = '{2}', 'Number of Perplexity iterations are wrong') FROM lda_model; +SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') FROM lda_model ; +-- Commenting the below flaky test to re-visit later. +-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ; + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 3, -- iter_num + 10, -- alpha + 0.01, -- beta + 1, -- evaluate_every + .1 -- perplexity_tol + ); + +SELECT assert(array_upper(perplexity,1) = 3, 'Perplexity calculation is wrong') FROM lda_model; +SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') FROM lda_model ; +-- Commenting the below flaky test to re-visit later. 
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ; + + +-- Function to check if the perplexity value returned from the function +-- and calculated by the train function are the same. +CREATE OR REPLACE FUNCTION validate_perplexity() RETURNS boolean AS $$ + + DECLARE + perplexity_from_func Double precision[]; + perplexity_lda_train Double precision[]; + + BEGIN + drop table if exists lda_model, lda_output_data; + PERFORM lda_train( + 'lda_training', + 'lda_model', + 'lda_output_data', + 20, 5, 2, 10, 0.01, 2, .2); + + SELECT array_agg(round(lda_get_perplexity::numeric,10)) INTO perplexity_from_func from lda_get_perplexity('lda_model','lda_output_data'); + + select perplexity INTO perplexity_lda_train from lda_model ; + + + if perplexity_lda_train != perplexity_from_func THEN + return FALSE; + ELSE + return TRUE; + END IF; + + END; + +$$ LANGUAGE plpgsql; + +SELECT assert(validate_perplexity() = TRUE, 'Perplexity calculation is wrong'); +SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') FROM lda_model ; +-- Commenting the below flaky test to re-visit later. +-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ; + +-- Test for evaluate_every = Number of iterations = 1.
It should give exactly one perplexity value -- + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 1, -- iter_num + 10, -- alpha + 0.01, -- beta + 1, -- evaluate_every + .1 -- perplexity_tol + ); + +select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model; +select assert(array_upper(perplexity,1) = 1, 'Perplexity should not have more than 1 value') from lda_model; + + +-- Test for evaluate_every = 0 and -1 : In this do not calculate perplexity-- + + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 1, -- iter_num + 10, -- alpha + 0.01, -- beta + 0, -- evaluate_every + .1 -- perplexity_tol + ); + +select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model; +select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') from lda_model ; + +-- Test for evaluate_every = 0 and -1 : In this do not calculate perplexity-- + + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 1, -- iter_num + 10, -- alpha + 0.01, -- beta + -1, -- evaluate_every + .1 -- perplexity_tol + ); + +select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ; +select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') from lda_model ; + + +-- Test to check if the perplexity_iters are matching the expected value -- + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- 
voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + 2, -- evaluate_every + .1 -- perplexity_tol + ); + +SELECT assert(array_upper(perplexity_iters,1) <= 5, 'Perplexity iterations are different from expected') FROM lda_model ; +SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') FROM lda_model ; +-- Commenting the below flaky test to re-visit later. +-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ; + + +-- Test: If the difference between any two iterations is less than the perplexity_tol, we will stop the training. -- +-- In this case, it will iterate to two iterations only as the perplexity_tol is very large and the difference between the 2 and 4th iteration -- +-- will be less than 10, so Only 2 iterations will be recorded -- + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + 2, -- evaluate_every + 100 -- perplexity_tol + ); + + +SELECT assert(abs(perplexity[2] - perplexity[1]) <100, 'Perplexity tol is less than the perplexity difference') FROM lda_model ; +SELECT assert(array_upper(perplexity_iters,1) = 2, 'Perplexity iterations are different from expected') FROM lda_model ; +SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') FROM lda_model ; +-- Commenting the below flaky test to re-visit later. 
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ; + + +-- Test for evaluate_every = 1 and 0 : In this case the iterations should not stop early -- + + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + 1, -- evaluate_every + 0 -- perplexity_tol + ); + +select assert(num_iterations = 10, 'Perplexity should run for all the iterations') from lda_model ; + + +-- Test for evaluate_every = NULL and perplexity_tol = NULL. In this case it should not calculate perplexity -- + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + NULL, -- evaluate_every + NULL -- perplexity_tol + ); + +select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ; +select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') from lda_model ; + + +-- Test for evaluate_every = 1 and perplexity_tol = NULL. 
-- +-- In this case it should calculate perplexity with perplexity_tol = 0.1 as default value -- + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + 1, -- evaluate_every + NULL -- perplexity_tol + ); + +select assert(array_upper(perplexity_iters,1) >= 1, 'Perplexity iterations are different from expected') from lda_model ; +select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model; + + +-- Test for evaluate_every = NULL and perplexity_tol != NULL -- +-- In this case it should not calculate perplexity -- + + +drop table if exists lda_model, lda_output_data; +SELECT lda_train( + 'lda_training', -- data_table + 'lda_model', -- model_table + 'lda_output_data', -- output_data_table + 20, -- voc_size + 5, -- topic_num + 10, -- iter_num + 10, -- alpha + 0.01, -- beta + NULL, -- evaluate_every + 1 -- perplexity_tol + ); + +select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ; +select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') from lda_model ; +