[madlib] branch master updated: MADLIB-1351 : Added stopping criteria on perplexity to LDA

khannaekta Mon, 18 Nov 2019 12:35:25 -0800

This is an automated email from the ASF dual-hosted git repository.

khannaekta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git



The following commit(s) were added to refs/heads/master by this push:
     new 5a1717e  MADLIB-1351 : Added stopping criteria on perplexity to LDA
5a1717e is described below

commit 5a1717ee0f40084e904d07e13e8fd7ea1162358a
Author: Himanshu Pandey <hpan...@pivotal.io>
AuthorDate: Tue Aug 27 11:10:07 2019 -0700

    MADLIB-1351 : Added stopping criteria on perplexity to LDA
    
    Prior to this commit, in LDA there are no stopping criteria. It runs for
    all the provided iterations. This commit calculates the perplexity on
    each iteration and when the difference between the last two perplexity
    values is less than the perplexity_tol, it stops the iteration.
    
    These are the two new parameters added to the function:
    
    ```
    evaluate_every  INTEGER,
    perplexity_tol  DOUBLE PRECISION
    ```
    
    Also, there is a change to the model output table. The following new
    columns are added:
    
    1. perplexity(DOUBLE PRECISION[]): is an array of perplexity values as
    per the 'evaluate_every' parameter.
    2. perplexity_iters(INTEGER[]): is an Array indicating the iterations
    for which perplexity is calculated
---
 src/ports/postgres/modules/lda/lda.py_in       | 141 +++++++++++--
 src/ports/postgres/modules/lda/lda.sql_in      | 161 +++++++++++++-
 src/ports/postgres/modules/lda/test/lda.sql_in | 277 +++++++++++++++++++++++++
 3 files changed, 546 insertions(+), 33 deletions(-)

diff --git a/src/ports/postgres/modules/lda/lda.py_in 
b/src/ports/postgres/modules/lda/lda.py_in
index 303afee..29c9d85 100644
--- a/src/ports/postgres/modules/lda/lda.py_in
+++ b/src/ports/postgres/modules/lda/lda.py_in
@@ -18,6 +18,7 @@ from utilities.control import HashaggControl
 from utilities.utilities import __mad_version, _assert, warn
 from utilities.validate_args import output_tbl_valid
 from utilities.validate_args import input_tbl_valid
+from utilities.utilities import py_list_to_sql_string
 
 # use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0
 version_wrapper = __mad_version()
@@ -33,7 +34,7 @@ class LDATrainer:
 
     def __init__(self, schema_madlib, data_table, model_table,
                  output_data_table, voc_size, topic_num,
-                 iter_num, alpha, beta):
+                 iter_num, alpha, beta, evaluate_every, perplexity_tol ):
         self.schema_madlib = schema_madlib
         self.data_table = data_table
         self.voc_size = voc_size
@@ -45,6 +46,13 @@ class LDATrainer:
         self.output_data_table = output_data_table
         self.work_table_0 = '__work_table_train_0__'
         self.work_table_1 = '__work_table_train_1__'
+        self.evaluate_every = evaluate_every
+        self.perplexity_tol = perplexity_tol
+        self.perplexity = []
+        self.perplexity_diff = self.perplexity_tol
+        self.perplexity_iters = []
+        self.tol_reached = False
+        self.num_iterations = 0
 
         plpy.execute("DROP TABLE IF EXISTS " + self.work_table_0)
         plpy.execute("""
@@ -80,7 +88,10 @@ class LDATrainer:
                 topic_num   INT4,
                 alpha       FLOAT8,
                 beta        FLOAT8,
-                model       INT8[]
+                model       INT8[],
+                num_iterations INT,
+                perplexity  DOUBLE PRECISION[],
+                perplexity_iters INTEGER[]
                 )
                 m4_ifdef(`__POSTGRESQL__', `',
                     `WITH (APPENDONLY=TRUE)
@@ -135,24 +146,23 @@ class LDATrainer:
         # iteration to sycn up output table and model table
         self.update_model_table(work_table_final)
 
-        # Update output table
-        plpy.execute("TRUNCATE TABLE " + self.output_data_table)
-        plpy.execute("""
-            INSERT INTO {output_data_table}
-            SELECT
-                docid, wordcount, words, counts, doc_topic[1:{topic_num}] 
topic_count,
-                doc_topic[{topic_num} + 1:array_upper(doc_topic,1)] 
topic_assignment
-            FROM
-                {work_table_final}
-            """.format(output_data_table=self.output_data_table,
-                       topic_num=self.topic_num,
-                       work_table_final=work_table_final))
-        # etime = time.time()
-        # plpy.notice('\t\t\ttime elapsed: %.2f seconds' % (etime - stime))
+        self.gen_output_data_table(work_table_final)
+
+        # JIRA: MADLIB-1351
+        # Calculate Perplexity after the final update of
+        # the Model and Output Table
+        if self.evaluate_every > 0 and not self.tol_reached:
+            self.perplexity.append(
+                get_perplexity(self.schema_madlib,
+                               self.model_table,
+                               self.output_data_table))
+            # Need to update Model Table one more time to update
+            # last calculated value of perplexity to it
+            self.update_model_table(work_table_final)
+
 
     def iteration(self, it):
         # stime = time.time()
-
         work_table_in = self.work_table_0
         work_table_out = self.work_table_1
         if it % 2 == 0:
@@ -191,7 +201,21 @@ class LDATrainer:
         # etime = time.time()
         # plpy.notice('\t\ttime elapsed: %.2f seconds' % (etime - stime))
 
+        self.calculatePerplexity(it, work_table_in)
+        
+
     def update_model_table(self, work_table_in):
+
+        # JIRA: MADLIB-1351
+        # Create a string based on the value of self.perplexity
+        perplexity_values = ""
+        perplexity_iterations = ""
+        if len(self.perplexity) >= 1:
+            perplexity_values = ", 
{0}".format(py_list_to_sql_string(self.perplexity))
+            perplexity_iterations = ", 
{0}".format(py_list_to_sql_string(self.perplexity_iters))
+        n_iterations=", {0}".format(self.num_iterations)
+
+
         plpy.execute('TRUNCATE TABLE ' + self.model_table)
         if version_wrapper.is_gp43():
             with OptimizerControl(True):
@@ -209,14 +233,18 @@ class LDATrainer:
                             {voc_size},
                             {topic_num}
                         ) AS model
+                        {n_iterations}{perplexity_values} 
{perplexity_iterations}
                     FROM {work_table_in}
                     """.format(model_table=self.model_table,
                                topic_num=self.topic_num,
                                voc_size=self.voc_size,
                                alpha=self.alpha,
                                beta=self.beta,
+                               perplexity_values=perplexity_values,
+                               perplexity_iterations=perplexity_iterations,
                                schema_madlib=self.schema_madlib,
-                               work_table_in=work_table_in))
+                               work_table_in=work_table_in,
+                               n_iterations=n_iterations))
         else:
             # work around insertion memory error (MPP-25561)
             # by copying the model to Python temporarily
@@ -239,31 +267,90 @@ class LDATrainer:
                 INSERT INTO {model_table}
                 SELECT
                     {voc_size}, {topic_num}, {alpha}, {beta}, $1
+                    {n_iterations}{perplexity_values} {perplexity_iterations}
                 """.format(model_table=self.model_table,
                            topic_num=self.topic_num,
                            voc_size=self.voc_size,
                            alpha=self.alpha,
                            beta=self.beta,
-                           schema_madlib=self.schema_madlib),
+                           perplexity_values=perplexity_values,
+                           perplexity_iterations=perplexity_iterations,
+                           schema_madlib=self.schema_madlib,
+                           n_iterations=n_iterations),
                 ['bigint[]'])
             plpy.execute(plan, [model])
 
     def run(self):
         # stime = time.time()
         # plpy.notice('start training process ...')
-
         self.init_random()
         # sstime = time.time()
         for it in range(1, self.iter_num + 1):
+            # JIRA: MADLIB-1351
+            # If the Perplexity_diff is less than the perplexity_tol,
+            # Stop the iteration
+            if self.perplexity_diff < self.perplexity_tol:
+                self.tol_reached = True
+                # When toll is reached before the number of iterations, 
+                # Reduce the num_iterations by 1 since perplexity_iters
+                # Runs one iteration behind in this case. 
+                self.num_iterations-=1;
+                break
+
+
             self.iteration(it)
+            self.num_iterations+=1;
         # eetime = time.time()
         # plpy.notice('\t\titeration done, time elapsed: %.2f seconds' % 
(eetime - sstime))
 
+
+        # JIRA: MADLIB-1351
+        # Add the last iteration value to the array
+        if self.evaluate_every > 0 and not self.tol_reached:
+            self.perplexity_iters.append(self.iter_num)
         self.gen_final_data_tables()
 
         # etime = time.time()
         # plpy.notice('finished, time elapsed: %.2f seconds' % (etime - stime))
 
+    # Update output table
+    def gen_output_data_table(self, work_table_final):
+        plpy.execute("TRUNCATE TABLE " + self.output_data_table)
+        plpy.execute("""
+            INSERT INTO {output_data_table}
+            SELECT
+                docid, wordcount, words, counts, doc_topic[1:{topic_num}] 
topic_count,
+                doc_topic[{topic_num} + 1:array_upper(doc_topic,1)] 
topic_assignment
+            FROM
+                {work_table_final}
+            """.format(output_data_table=self.output_data_table,
+                       topic_num=self.topic_num,
+                       work_table_final=work_table_final))
+        # etime = time.time()
+        # plpy.notice('\t\t\ttime elapsed: %.2f seconds' % (etime - stime)) 
+
+
+    def calculatePerplexity(self,it, work_table_in):
+        # JIRA: MADLIB-1351
+        # Calculate Perplexity for evaluate_every Iteration
+        # Skip the calculation at the first iteration 
+        # For each iteration: 
+        # Model table is updated (for the first iteration, it is the random 
model. For iteration >1 , the model that is   # updated is learnt in the 
previous iteration)
+        # __lda_count_topic_agg is called then lda_gibbs_sample is called 
which learns and updates the model(the updated  # model is not passed to 
python. The learnt model is updated in the next iteration)
+        # Because of this workflow we can safely ignore the first perplexity 
value.
+        
+
+        if it > self.evaluate_every and self.evaluate_every > 0 and (
+                it - 1) % self.evaluate_every == 0:
+            self.gen_output_data_table(work_table_in)
+            perplexity = get_perplexity(self.schema_madlib,
+                                        self.model_table,
+                                        self.output_data_table)
+            if len(self.perplexity) > 0:
+                self.perplexity_diff = abs(self.perplexity[-1] - perplexity)
+            self.perplexity_iters.append(it - 1)
+            self.perplexity.append(perplexity)
+
 # 
------------------------------------------------------------------------------
 
 
@@ -416,7 +503,7 @@ class LDAPredictor:
 
 
 def lda_train(schema_madlib, train_table, model_table, output_data_table, 
voc_size,
-              topic_num, iter_num, alpha, beta):
+              topic_num, iter_num, alpha, beta, evaluate_every, 
perplexity_tol):
     """
     @brief This function provides the entry for the LDA training process.
     @param schema_madlib        MDALib schema
@@ -446,6 +533,16 @@ def lda_train(schema_madlib, train_table, model_table, 
output_data_table, voc_si
     _assert(beta is not None and beta > 0,
             'invalid argument: positive real expected for beta')
 
+    # Setting the default values for perplexity_tol and evaluate_every
+    if perplexity_tol is None:
+        perplexity_tol = 0.1
+    if evaluate_every is None:
+        evaluate_every = 0
+    _assert(evaluate_every <= iter_num,
+            'invalid argument: evaluate_every should not be greater than 
iter_num')
+    _assert(perplexity_tol is not None and perplexity_tol >= 0,
+            'invalid argument: perplexity_tol should not be less than 0')
+
     output_tbl_valid(model_table, 'LDA')
     output_tbl_valid(output_data_table, 'LDA')
 
@@ -460,7 +557,7 @@ def lda_train(schema_madlib, train_table, model_table, 
output_data_table, voc_si
     convt_table = _convert_data_table(schema_madlib, train_table)
     lt = LDATrainer(schema_madlib, convt_table, model_table,
                     output_data_table, voc_size, topic_num,
-                    iter_num, alpha, beta)
+                    iter_num, alpha, beta,evaluate_every, perplexity_tol)
 
     lt.run()
 
diff --git a/src/ports/postgres/modules/lda/lda.sql_in 
b/src/ports/postgres/modules/lda/lda.sql_in
index 16dc3a8..36e8106 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -62,12 +62,12 @@ The following generative process then defines a 
distribution over a corpus of
 documents:
 
 - Sample for each topic \f$ i \f$, a per-topic word
-distribution \f$ \phi_i \f$ from the Dirichlet(\f$\beta\f$) prior.
+distribution \f$ \phi_i \f$ from the Dirichlet (\f$\beta\f$) prior.
 
 - For each document:
     - Sample a document length N from a suitable distribution, say, Poisson.
     - Sample a topic mixture \f$ \theta \f$ for the document from the
-Dirichlet(\f$\alpha\f$) distribution.
+Dirichlet (\f$\alpha\f$) distribution.
     - For each of the N words:
         - Sample a topic \f$ z_n \f$ from the multinomial topic distribution 
\f$
    \theta \f$.
@@ -99,7 +99,9 @@ lda_train( data_table,
            topic_num,
            iter_num,
            alpha,
-           beta
+           beta,
+           evaluate_every,
+           perplexity_tol
          )
 </pre>
 \b Arguments
@@ -150,6 +152,28 @@ lda_train( data_table,
                 <th>model</th>
                 <td>BIGINT[]. The encoded model description (not human 
readable).</td>
             </tr>
+            <tr>
+                <th>num_iterations</th>
+                <td>INTEGER.  Number of iterations that training ran for, 
which may
+                be less than the maximum value specified in the parameter 
'iter_num'
+                if the perplexity tolerance was reached.</td>
+            </tr>
+            <tr>
+                <th>perplexity</th>
+                <td>DOUBLE PRECISION[].  Array of perplexity values as per the 
'evaluate_every' parameter.
+                For example, if 'evaluate_every=5' this would be an array of 
perplexity values for
+                every 5th iteration, plus the last iteration.</td>
+            </tr>
+            <tr>
+                <th>perplexity_iters</th>
+                <td>INTEGER[]. Array indicating the iterations for which 
perplexity is calculated, as derived
+                from the parameters 'iter_num' and 'evaluate_every'.  For 
example, if 'iter_num=5' and 'evaluate_every=2',
+                then 'perplexity_iters' value would be {2,4,5} indicating that 
perplexity is computed at
+                iterations 2, 4 and 5 (at the end), unless of course it 
terminated earlier due
+                to 'perplexity_tol'.  If 'iter_num=5' and 'evaluate_every=1', 
then 'perplexity_iters' value
+                would be {1,2,3,4,5} indicating that perplexity is computed at 
every iteration,
+                again assuming it ran the full number of iterations.</td>
+            </tr>
         </table>
     </dd>
     <dt>output_data_table</dt>
@@ -204,14 +228,24 @@ lda_train( data_table,
     <dt>topic_num</dt>
     <dd>INTEGER. Desired number of topics.</dd>
     <dt>iter_num</dt>
-    <dd>INTEGER. Desired number of iterations.</dd>
+    <dd>INTEGER. Maximum number of iterations.  If a 'perplexity_tol' is set,
+    LDA may train for less than the maximum number of iterations if the 
tolerance is reached.</dd>
     <dt>alpha</dt>
     <dd>DOUBLE PRECISION. Dirichlet prior for the per-document topic
     multinomial (e.g., 50/topic_num is a reasonable value to start with
-    as per Griffiths and Steyvers [2] ).</dd>
+    as per Griffiths and Steyvers [2]).</dd>
     <dt>beta</dt>
     <dd>DOUBLE PRECISION. Dirichlet prior for the per-topic
     word multinomial (e.g., 0.01 is a reasonable value to start with).</dd>
+    <dt>evaluate_every (optional)</dt>
+    <dd>INTEGER, default: 0. How often to evaluate perplexity. Set it to 0 or 
a negative number
+    to not evaluate perplexity in training at all. Evaluating perplexity can 
help you check
+    convergence during the training process, but it will also increase total 
training time.
+    For example, evaluating perplexity in every iteration might increase 
training time
+    up to two-fold.</dd>
+    <dt>perplexity_tol (optional)</dt>
+    <dd>DOUBLE PRECISION, default: 0.1. Perplexity tolerance to stop iteration.
+    Only used when the parameter 'evaluate_every' is greater than 0.</dd>
 </dl>
 
 @anchor predict
@@ -246,11 +280,11 @@ lda_predict( data_table,
 @anchor perplexity
 @par Perplexity
 Perplexity describes how well the model fits the data by
-computing word likelihoods averaged over the test documents.
+computing word likelihoods averaged over the documents.
 This function returns a single perplexity value.
 <pre class="syntax">
 lda_get_perplexity( model_table,
-                    output_predict_table
+                    output_data_table
                   );
 </pre>
 \b Arguments
@@ -258,9 +292,10 @@ lda_get_perplexity( model_table,
 <dt>model_table</dt>
     <dd>TEXT. The model table generated by the training process.
     </dd>
-<dt>output_predict_table</dt>
-    <dd>TEXT. The prediction output table generated by the
-    predict function above.
+<dt>output_data_table</dt>
+    <dd>TEXT. Output table generated by the
+    training or predict functions, containing the topic assignments
+    by word.
     </dd>
 </dl>
 
@@ -609,6 +644,20 @@ counts           | 
{1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,2,11,1,1,2,1,1,3,1,1,1,1,1,1,1
 topic_count      | {5,5,26,5,8}
 topic_assignment | 
{4,4,4,0,2,0,0,2,4,4,2,2,2,1,2,4,1,0,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,4,3,3,3,2,3,2,3,2,1,4,2,2,1,0}
 </pre>
+Review summary table:
+<pre class="example">
+SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, 
perplexity_iters from lda_model;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]----+-----
+voc_size         | 103
+topic_num        | 5
+alpha            | 5
+beta             | 0.01
+num_iterations   | 10
+perplexity       |
+perplexity_iters |
+</pre>
 
 -# Review learned model using helper functions.
 First, we get topic description by top-k words. These are
@@ -897,6 +946,48 @@ SELECT madlib.lda_get_perplexity( 'lda_model',        -- 
LDA model from training
 (1 row)
 </pre>
 
+-# Perplexity by iteration.  Now let's look at how perplexity
+changes from one iteration to the next:
+<pre class="example">
+DROP TABLE IF EXISTS lda_model_perp, lda_output_data_perp;
+SELECT madlib.lda_train( 'documents_tf',          -- documents table in the 
form of term frequency
+                         'lda_model_perp',        -- model table created by 
LDA training (not human readable)
+                         'lda_output_data_perp',  -- readable output data table
+                         103,                     -- vocabulary size
+                         5,                       -- number of topics
+                         30,                      -- number of iterations
+                         5,                       -- Dirichlet prior for the 
per-doc topic multinomial (alpha)
+                         0.01,                    -- Dirichlet prior for the 
per-topic word multinomial (beta)
+                         2,                       -- Evaluate perplexity every 
n iterations
+                         0.3                      -- Tolerance to stop 
iteration
+                       );
+SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, 
perplexity_iters from lda_model_perp;
+</pre>
+<pre class="result">
+-[ RECORD 1 
]----+----------------------------------------------------------------------------------------------------
+voc_size         | 103
+topic_num        | 5
+alpha            | 5
+beta             | 0.01
+num_iterations   | 14
+perplexity       | 
{70.0297335165,65.6497887327,70.2040806534,68.2594871716,70.3816093812,67.9193935299,67.6325562682}
+perplexity_iters | {2,4,6,8,10,12,14}
+</pre>
+Iterating stops at 14 since the tolerance is reached.  There are 7
+perplexity values because we computed it only every 2nd iteration to save time.
+As expected, the perplexity
+on the training data is that same as the final iteration value:
+<pre class="example">
+SELECT madlib.lda_get_perplexity( 'lda_model_perp',
+                                  'lda_output_data_perp'
+                                );
+</pre>
+<pre class="result">
+ lda_get_perplexity
+--------------------+
+    67.632556268157
+</pre>
+
 @anchor literature
 @literature
 
@@ -967,7 +1058,55 @@ RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$
     PythonFunctionBodyOnly(`lda', `lda')
     with AOControl(False):
         lda.lda_train(schema_madlib, data_table, model_table, 
output_data_table,
-                      voc_size, topic_num, iter_num, alpha, beta)
+                      voc_size, topic_num, iter_num, alpha, beta, None, None)
+    return [[model_table, 'model table'],
+        [output_data_table, 'output data table']]
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.lda_train
+(
+    data_table          TEXT,
+    model_table         TEXT,
+    output_data_table   TEXT,
+    voc_size            INT4,
+    topic_num           INT4,
+    iter_num            INT4,
+    alpha               FLOAT8,
+    beta                FLOAT8,
+    evaluate_every      INT4,
+    perplexity_tol      FLOAT8
+)
+RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$
+    PythonFunctionBodyOnly(`lda', `lda')
+    with AOControl(False):
+        lda.lda_train(schema_madlib, data_table, model_table, 
output_data_table,
+                      voc_size, topic_num, iter_num, alpha, 
beta,evaluate_every , perplexity_tol)
+    return [[model_table, 'model table'],
+        [output_data_table, 'output data table']]
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.lda_train
+(
+    data_table          TEXT,
+    model_table         TEXT,
+    output_data_table   TEXT,
+    voc_size            INT4,
+    topic_num           INT4,
+    iter_num            INT4,
+    alpha               FLOAT8,
+    beta                FLOAT8,
+    evaluate_every      INT4
+)
+RETURNS SETOF MADLIB_SCHEMA.lda_result AS $$
+    PythonFunctionBodyOnly(`lda', `lda')
+    with AOControl(False):
+        lda.lda_train(schema_madlib, data_table, model_table, 
output_data_table,
+                      voc_size, topic_num, iter_num, alpha, 
beta,evaluate_every , None)
     return [[model_table, 'model table'],
         [output_data_table, 'output data table']]
 $$ LANGUAGE plpythonu
diff --git a/src/ports/postgres/modules/lda/test/lda.sql_in 
b/src/ports/postgres/modules/lda/test/lda.sql_in
index c230240..96d5f5f 100644
--- a/src/ports/postgres/modules/lda/test/lda.sql_in
+++ b/src/ports/postgres/modules/lda/test/lda.sql_in
@@ -288,3 +288,280 @@ CREATE OR REPLACE FUNCTION validate_lda_output() RETURNS 
integer AS $$
 $$ LANGUAGE plpgsql;
 
 select validate_lda_output();
+
+
+---------- TEST CASES FOR PERPLEXITY ----------
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',             -- data_table
+    'lda_model',                -- model_table
+    'lda_output_data',          -- output_data_table
+    20,                         -- voc_size
+    5,                          -- topic_num
+    2,                          -- iter_num
+    10,                         -- alpha
+    0.01,                       -- beta
+    2,                          -- evaluate_every
+    .2);                        -- perplexity_tol
+
+SELECT assert(perplexity_iters = '{2}', 'Number of Perplexity iterations are 
wrong') FROM lda_model;
+SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') 
FROM lda_model ;
+-- Commenting the below flaky test to re-visit later.
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= 
array_upper(perplexity,1) , 'Perplexity values should be unique') from 
lda_model ;
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    3,                       -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta
+    1,                       -- evaluate_every
+    .1                       -- perplexity_tol
+    );
+
+SELECT assert(array_upper(perplexity,1) = 3, 'Perplexity calculation is 
wrong') FROM lda_model;
+SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') 
FROM lda_model ;
+-- Commenting the below flaky test to re-visit later.
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= 
array_upper(perplexity,1) , 'Perplexity values should be unique') from 
lda_model ;
+
+
+-- Function to check if the perplexity value returned from the function 
+-- and calculated by the train funcion are same. 
+CREATE OR REPLACE FUNCTION validate_perplexity() RETURNS boolean AS $$
+
+    DECLARE
+        perplexity_from_func Double precision[];
+        perplexity_lda_train Double precision[];
+
+    BEGIN
+        drop table if exists lda_model, lda_output_data;
+        PERFORM lda_train(
+        'lda_training',
+        'lda_model',
+        'lda_output_data',
+        20, 5, 2, 10, 0.01, 2, .2);
+
+         SELECT array_agg(round(lda_get_perplexity::numeric,10))  INTO 
perplexity_from_func from lda_get_perplexity('lda_model','lda_output_data');
+
+          select perplexity INTO perplexity_lda_train from lda_model ;
+
+
+        if perplexity_lda_train != perplexity_from_func  THEN
+            return FALSE;
+        ELSE
+            return TRUE;
+        END IF;    
+
+    END;
+
+$$ LANGUAGE plpgsql;
+
+SELECT assert(validate_perplexity() = TRUE, 'Perplexity calculation is wrong');
+SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') 
FROM lda_model ;
+-- Commenting the below flaky test to re-visit later.
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= 
array_upper(perplexity,1) , 'Perplexity values should be unique') from 
lda_model ;
+
+-- Test for evaluate_every = Number of iterations = 1. It should give exactly 
one perplexity value --
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    1,                       -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta
+    1,                       -- evaluate_every
+    .1                       -- perplexity_tol
+    );
+
+select assert(perplexity != '{}', 'Perplexity should be calculated') from 
lda_model;
+select assert(array_upper(perplexity,1) = 1, 'Perplexity should not have more 
than 1 value') from lda_model;
+
+
+-- Test for evaluate_every = 0  and -1 : In this do not calculate perplexity--
+
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    1,                       -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    0,                       -- evaluate_every
+    .1                       -- perplexity_tol
+    );
+
+select assert(perplexity = '{}', 'Perplexity should not be calculated') from 
lda_model;
+select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') 
from lda_model ;
+
+-- Test for evaluate_every = 0  and -1 : In this do not calculate perplexity--
+
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    1,                       -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    -1,                      -- evaluate_every
+    .1                       -- perplexity_tol
+    );
+
+select assert(perplexity = '{}', 'Perplexity should not be calculated') from 
lda_model ;
+select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') 
from lda_model ;
+
+
+-- Test to check if the perplexity_iters are matching the expected value --
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta
+    2,                       -- evaluate_every
+    .1                       -- perplexity_tol
+    );
+
+SELECT assert(array_upper(perplexity_iters,1) <= 5, 'Perplexity iterations are 
different from expected') FROM lda_model ;
+SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') 
FROM lda_model ;
+-- Commenting the below flaky test to re-visit later.
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= 
array_upper(perplexity,1) , 'Perplexity values should be unique') from 
lda_model ;
+
+
+-- Test: If the difference between any two iterations is less than the 
perplexity_tol, we will stop the training. --
+-- In this case, it will iterate to two iterations only as the perplexity_tol 
is very large and the difference between the 2 and 4th iteration -- 
+-- will be less than 10, so Only 2 iterations will be recorded --
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta
+    2,                       -- evaluate_every
+    100                      -- perplexity_tol
+    );
+
+
+SELECT assert(abs(perplexity[2] - perplexity[1]) <100, 'Perplexity tol is less 
than the perplexity difference') FROM lda_model ;
+SELECT assert(array_upper(perplexity_iters,1)  = 2, 'Perplexity iterations are 
different from expected') FROM lda_model ;
+SELECT assert(perplexity[1] > 0 , 'Perplexity value should be greate than 0') 
FROM lda_model ;
+-- Commenting the below flaky test to re-visit later.
+-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= 
array_upper(perplexity,1) , 'Perplexity values should be unique') from 
lda_model ;
+
+
+-- Test for evaluate_every = 1  and 0 : In this case the iterations should not 
stop early --
+
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    1,                       -- evaluate_every
+    0                        -- perplexity_tol
+    );
+
+select assert(num_iterations = 10, 'Perplexity should run for all the 
iterations') from lda_model ;
+
+
+-- Test for evaluate_every = NULL and perplexity_tol = NULL.  In this case it 
should not calculate perplexity -- 
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    NULL,                    -- evaluate_every
+    NULL                     -- perplexity_tol
+    );
+
+select assert(perplexity = '{}', 'Perplexity should not be calculated') from 
lda_model ;
+select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') 
from lda_model ;
+
+
+-- Test for evaluate_every = 1 and perplexity_tol = NULL. --
+-- In this case it should calculate perplexity with perplexity_tol = 0.1 as 
default value -- 
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    1,                       -- evaluate_every
+    NULL                     -- perplexity_tol
+    );
+
+select assert(array_upper(perplexity_iters,1) >= 1, 'Perplexity iterations are 
different from expected') from lda_model ;
+select assert(perplexity != '{}', 'Perplexity should be calculated') from 
lda_model;
+
+
+-- Test for evaluate_every = NULL and perplexity_tol != NULL --
+-- In this case it should not calculate perplexity -- 
+
+
+drop table if exists lda_model, lda_output_data;
+SELECT lda_train(
+    'lda_training',          -- data_table
+    'lda_model',             -- model_table
+    'lda_output_data',       -- output_data_table
+    20,                      -- voc_size
+    5,                       -- topic_num
+    10,                      -- iter_num
+    10,                      -- alpha
+    0.01,                    -- beta   
+    NULL,                    -- evaluate_every
+    1                        -- perplexity_tol
+    );
+
+select assert(perplexity = '{}', 'Perplexity should not be calculated') from 
lda_model ;
+select assert(perplexity_iters = '{}', 'Perplexity iterations should be null') 
from lda_model ;
+

[madlib] branch master updated: MADLIB-1351 : Added stopping criteria on perplexity to LDA

Reply via email to