Attached is a patch to 0.8.5 quick-cluster.c to support the
/PRINT=CLUSTER subcommand that identifies the cluster membership of each
case in the analysis.  I also included test syntax, data, and output.

John has suggested some stylistic changes that are included in this
patch, but three last three suggestions are not in this patch:

> I think this patch could be checked in if we also do the following:
>
>
> 1.  Add the necessary changes to doc/statistics.texi documenting the
>    /PRINT subcommand.
>
> 2.  Add a test to tests/language/stats/quick-cluster.at which exercises
>    the /PRINT subcommand.
>
> 3. We should omit the syntax parsing code for the /SAVE command, because
>    it doesn't do anything yet.

In addition, I've been looking over the regression.c code to see how it
implements /SAVE but I'm not sure I can replicate it in the cluster code.

-Alan



On 7/27/2015 10:07 PM, Ben Pfaff wrote:
> On Tue, Jul 28, 2015 at 04:47:51AM +0200, John Darrington wrote:
>> On Mon, Jul 27, 2015 at 12:11:19PM -0500, Alan Mead wrote:
>>      
>>      In the attached files, the .patch is the patch for 
>>      pspp-0.8.5/src/language/stats/quick-cluster.c (let me know if it's done
>>      incorrectly)
>>
>> It looks good to me.  Just some minor quibbles about style:
> It seems that I didn't receive the patch, and it doesn't seem to be in
> the mailing list archive.  Alan, perhaps you should re-send it?

-- 

Alan D. Mead, Ph.D.
President, Talent Algorithms Inc.

science + technology = better workers

+815.588.3846 (Office)
+267.334.4143 (Mobile)

http://www.alanmead.org

Announcing the Journal of Computerized Adaptive Testing (JCAT), a
peer-reviewed electronic journal designed to advance the science and
practice of computerized adaptive testing: http://www.iacat.org/jcat

--- pspp-0.8.5-original/src/language/stats/quick-cluster.c      2015-05-30 
18:13:57.000000000 -0500
+++ pspp-0.8.5/src/language/stats/quick-cluster.c       2015-07-27 
07:59:42.455444022 -0500
@@ -60,6 +60,10 @@
 
   int ngroups;                 /* Number of group. (Given by the user) */
   int maxiter;                 /* Maximum iterations (Given by the user) */
+  int print_cluster_membership; /* true => print membership */
+  int print_initial_clusters;   /* true => print initial cluster */
+  int save_cluster_membership;  /* true => save cluster membership as QCL_1 in 
dataset */
+  int save_cluster_distance;    /* true => save cluster distance as QCL_2 in 
dataset */
 
   const struct variable *wv;   /* Weighting variable. */
 
@@ -89,7 +93,7 @@
 
 static struct Kmeans *kmeans_create (const struct qc *qc);
 
-static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc 
*qc);
+static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *qc);
 
 static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, 
const struct qc *);
 
@@ -104,9 +108,11 @@
 
 static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, 
const struct qc *);
 
+static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *);
+
 static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const 
struct qc *);
 
-static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc 
*);
+static void quick_cluster_show_results (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *);
 
 int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
 
@@ -152,7 +158,7 @@
 
 /* Creates random centers using randomly selected cases from the data. */
 static void
-kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc)
+kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
 {
   int i, j;
   for (i = 0; i < qc->ngroups; i++)
@@ -346,11 +352,12 @@
   bool redo;
   int diffs;
   bool show_warning1;
+  int redo_count = 0;
 
   show_warning1 = true;
 cluster:
   redo = false;
-  kmeans_randomize_centers (kmeans, qc);
+  kmeans_randomize_centers (kmeans, reader, qc);
   for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter;
        kmeans->lastiter++)
     {
@@ -377,8 +384,13 @@
          break;
        }
     }
+
   if (redo)
-    goto cluster;
+    {
+      redo_count++;
+      assert (redo_count < 10);
+      goto cluster;
+    }
 
 }
 
@@ -446,6 +458,40 @@
   tab_submit (t);
 }
 
+/* Reports cluster membership for each case. */
+static void
+quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
+{
+  struct tab_table *t;
+  int nc, nr;
+  int i, clust; 
+  struct ccase *c;
+  struct casereader *cs = casereader_clone (reader);
+  nc = 2;
+  nr = kmeans->n + 1;
+  t = tab_create (nc, nr);
+  tab_headers (t, 0, nc - 1, 0, 0);
+  tab_title (t, _("Cluster Membership"));
+  tab_text (t, 0, 0, TAB_CENTER, _("Case Number"));
+  tab_text (t, 1, 0, TAB_CENTER, _("Cluster"));
+  tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
+
+  for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c))
+    {
+      assert (i < kmeans->n);
+      /* get cluster membership (does not reflect re-ordered centers) */
+      clust = kmeans_get_nearest_group (kmeans, c, qc); 
+      /* make cluster membership reflect re-ordered centers */
+      clust = kmeans->group_order->data[clust]; 
+      tab_text_format (t, 0, i + 1, TAB_CENTER, "%d", (i + 1));
+      tab_text_format (t, 1, i + 1, TAB_CENTER, "%d", (clust + 1));
+    }
+  assert (i == kmeans->n);
+  tab_submit (t);
+  casereader_destroy (cs);
+}
+
+
 /* Reports number of cases of each single cluster. */
 static void
 quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
@@ -479,13 +525,15 @@
 
 /* Reports. */
 static void
-quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc)
+quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
 {
-  kmeans_order_groups (kmeans, qc);
-  /* Uncomment the line below for reporting initial centers. */
-  /* quick_cluster_show_centers (kmeans, true); */
+  kmeans_order_groups (kmeans, qc); /* what does this do? */
+  if( qc->print_initial_clusters )
+    quick_cluster_show_centers (kmeans, true, qc);
   quick_cluster_show_centers (kmeans, false, qc);
   quick_cluster_show_number_cases (kmeans, qc);
+  if( qc->print_cluster_membership )
+     quick_cluster_show_membership(kmeans, reader, qc);
 }
 
 int
@@ -499,6 +547,10 @@
   qc.maxiter = 2;
   qc.missing_type = MISS_LISTWISE;
   qc.exclude = MV_ANY;
+  qc.print_cluster_membership = false; /* default = do not output case cluster 
membership */
+  qc.print_initial_clusters = false;   /* default = do not print initial 
clusters */
+  qc.save_cluster_membership = false;  /* default = do not save case cluster 
membership */
+  qc.save_cluster_distance = false;    /* default = do not save case cluster 
distance */
 
   if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
                              PV_NO_DUPLICATE | PV_NUMERIC))
@@ -536,6 +588,34 @@
                goto error;
            }     
        }
+      else if (lex_match_id (lexer, "SAVE"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "CLUSTER"))
+                qc.save_cluster_membership = true;
+             else if (lex_match_id (lexer, "DISTANCE"))
+               qc.save_cluster_distance = true;
+             else
+                goto error;
+           }
+       }
+      else if (lex_match_id (lexer, "PRINT"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "CLUSTER"))
+                qc.print_cluster_membership = true;
+             else if (lex_match_id (lexer, "INITIAL"))
+               qc.print_initial_clusters = true;
+             else
+                goto error;
+           }
+       }
       else if (lex_match_id (lexer, "CRITERIA"))
        {
          lex_match (lexer, T_EQUALS);
@@ -600,7 +680,7 @@
 
        kmeans = kmeans_create (&qc);
        kmeans_cluster (kmeans, group, &qc);
-       quick_cluster_show_results (kmeans, &qc);
+       quick_cluster_show_results (kmeans, group, &qc);
        kmeans_destroy (kmeans);
        casereader_destroy (group);
       }

Attachment: qc.sps
Description: application/spss-sps

case_id,cluster,x,y
1,1,10.45,9.38
2,1,10.67,9.17
3,1,10.86,9.63
4,1,8.77,8.45
5,1,8.04,11.77
6,1,10.34,9.83
7,1,10.37,10.54
8,1,11.49,8.18
9,1,10.17,11.10
10,1,11.37,9.16
11,1,10.25,8.83
12,1,8.69,9.92
13,1,10.36,10.39
14,1,10.89,10.51
15,1,9.91,11.39
16,1,11.11,10.91
17,1,11.77,8.47
18,1,9.51,10.46
19,1,10.67,9.33
20,1,9.74,8.85
21,2,-11.01,-9.21
22,2,-10.82,-11.76
23,2,-10.03,-10.29
24,2,-9.54,-9.17
25,2,-10.16,-9.82
26,2,-10.01,-8.63
27,2,-9.62,-10.22
28,2,-11.36,-10.93
29,2,-10.63,-10.97
30,2,-9.53,-10.78
31,2,-9.40,-10.26
32,2,-10.76,-9.76
33,2,-9.92,-10.11
34,2,-10.16,-9.75
35,2,-8.65,-11.31
36,2,-10.10,-10.90
37,2,-11.67,-9.89
38,2,-11.11,-9.23
39,2,-8.72,-8.43
40,2,-11.35,-8.68
41,3,-11.07,9.42
42,3,-10.20,9.00
43,3,-10.12,9.92
44,3,-10.41,10.16
45,3,-9.86,10.12
46,3,-10.31,10.12
47,3,-9.57,10.16
48,3,-9.69,9.93
49,3,-9.14,10.84
50,3,-9.83,10.19
51,3,-9.97,10.22
52,3,-11.65,10.81
53,3,-9.80,11.39
54,3,-10.31,10.74
55,3,-10.26,10.38
56,3,-11.57,10.02
57,3,-10.50,9.75
58,3,-9.06,9.63
59,3,-10.17,10.82
60,3,-10.22,9.99

Attachment: test3.pdf
Description: Adobe PDF document

_______________________________________________
pspp-dev mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/pspp-dev

Reply via email to