I've uploaded a patch (against quick-cluster.c in 0.8.4)  that adds
support for the /PRINT=CLUSTER subcommand for k-means clustering to show
the cluster membership for each case:

https://savannah.gnu.org/bugs/index.php?41019

But this patch has a remaining bug.  The clusters centers are saved in
some indirect fashion that I cannot understand. 

In the patch, I report the cluster number returned by
kmeans_get_nearest_group() but these cluster numbers are systematically
different from the reported cluster numbers.  That is, the centers are
stored internally in arbitrary order (as they are discovered, I'd guess)
and for purposes of reporting, they are numbered.  I cannot replicate
that output numbering.

For example, in the attached output, the centers were (10,10),
(-10,-10), and (-10,10) and 20 cases were generated for each cluster. 
The CLUSTER command reports 1 = (-10.23, -10.01), 2 = (-10.19, 10.18)
and 3=(10.27, 9.82) so the first 20 cases should be members of cluster
3, the next 20 from cluster 3 and the last 20 from cluster 2.  But using
the results from kmeans_get_nearest_group(), the clusters are reported
as 1, then 3, then 2.

I don't understand how I can fix this.  I think I need to use
kmeans->group_order which is a "gsl_permutation" but this is beyond my
familiarity with C and GSL.

It's also possible that kmeans_order_groups() (which is called at the
beginning of quick_cluster_show_results()) is not working properly.

Any advice?

-Alan

-- 

Alan D. Mead, Ph.D.
President, Talent Algorithms Inc.

science + technology = better workers

+815.588.3846 (Office)
+267.334.4143 (Mobile)

http://www.alanmead.org

Announcing the Journal of Computerized Adaptive Testing (JCAT), a
peer-reviewed electronic journal designed to advance the science and
practice of computerized adaptive testing: http://www.iacat.org/jcat

--- pspp-0.8.4-original/src/language/stats/quick-cluster.c      2014-08-24 
00:51:41.000000000 -0500
+++ pspp-0.8.4/src/language/stats/quick-cluster.c       2015-05-30 
16:58:46.049760839 -0500
@@ -60,6 +60,10 @@
 
   int ngroups;                 /* Number of group. (Given by the user) */
   int maxiter;                 /* Maximum iterations (Given by the user) */
+  int print_cluster_membership; /* true => print membership */
+  int print_initial_clusters;   /* true => print initial cluster */
+  int save_cluster_membership;  /* true => save cluster membership as QCL_1 in 
dataset */
+  int save_cluster_distance;    /* true => save cluster distance as QCL_2 in 
dataset */
 
   const struct variable *wv;   /* Weighting variable. */
 
@@ -89,7 +93,7 @@
 
 static struct Kmeans *kmeans_create (const struct qc *qc);
 
-static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc 
*qc);
+static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *qc);
 
 static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, 
const struct qc *);
 
@@ -104,9 +108,11 @@
 
 static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, 
const struct qc *);
 
+static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *);
+
 static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const 
struct qc *);
 
-static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc 
*);
+static void quick_cluster_show_results (struct Kmeans *kmeans, const struct 
casereader *reader, const struct qc *);
 
 int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
 
@@ -152,7 +158,7 @@
 
 /* Creates random centers using randomly selected cases from the data. */
 static void
-kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc)
+kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
 {
   int i, j;
   for (i = 0; i < qc->ngroups; i++)
@@ -346,11 +352,12 @@
   bool redo;
   int diffs;
   bool show_warning1;
+  int redo_count = 0;
 
   show_warning1 = true;
 cluster:
   redo = false;
-  kmeans_randomize_centers (kmeans, qc);
+  kmeans_randomize_centers (kmeans, reader, qc);
   for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter;
        kmeans->lastiter++)
     {
@@ -377,8 +384,13 @@
          break;
        }
     }
+
   if (redo)
-    goto cluster;
+    {
+      redo_count++;
+      assert (redo_count < 10);
+      goto cluster;
+    }
 
 }
 
@@ -446,6 +458,43 @@
   tab_submit (t);
 }
 
+/* Reports cluster membership for each case. */
+static void
+quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
+{
+  struct tab_table *t;
+  int nc, nr;
+  int i, clust; 
+  struct ccase *c;
+  struct casereader *cs = casereader_clone (reader);
+  nc = 2;
+  nr = kmeans->n + 1;
+  t = tab_create (nc, nr);
+  tab_headers (t, 0, nc - 1, 0, 0);
+  tab_title (t, _("Cluster Membership"));
+  tab_text (t, 0, 0, TAB_CENTER, _("Case Number"));
+  tab_text (t, 1, 0, TAB_CENTER, _("Cluster"));
+  tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
+  //tab_text (t, 0, 1, TAB_CENTER, _("1"));
+  //tab_text (t, 1, 1, TAB_CENTER, _("2"));
+  //tab_text (t, 0, 2, TAB_CENTER, _("2"));
+  //tab_text (t, 1, 2, TAB_CENTER, _("3"));
+
+  //for (i = 0; i < kmeans->n; i++)
+  for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c))
+    {
+      assert (i < kmeans->n);
+      clust = kmeans_get_nearest_group (kmeans, c, qc);
+      //int clust = 1; // kmeans_get_nearest_group (kmeans, c, qc);
+      tab_text_format (t, 0, i+1, TAB_CENTER, "%d", (i + 1));
+      tab_text_format (t, 1, i+1, TAB_CENTER, "%d", (clust + 1));
+    }
+  assert (i == kmeans->n);
+  tab_submit (t);
+  casereader_destroy (cs);
+}
+
+
 /* Reports number of cases of each single cluster. */
 static void
 quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
@@ -479,13 +528,15 @@
 
 /* Reports. */
 static void
-quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc)
+quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader 
*reader, const struct qc *qc)
 {
   kmeans_order_groups (kmeans, qc);
-  /* Uncomment the line below for reporting initial centers. */
-  /* quick_cluster_show_centers (kmeans, true); */
+  if( qc->print_initial_clusters )
+    quick_cluster_show_centers (kmeans, true, qc);
   quick_cluster_show_centers (kmeans, false, qc);
   quick_cluster_show_number_cases (kmeans, qc);
+  if( qc->print_cluster_membership )
+     quick_cluster_show_membership(kmeans, reader, qc);
 }
 
 int
@@ -499,6 +550,10 @@
   qc.maxiter = 2;
   qc.missing_type = MISS_LISTWISE;
   qc.exclude = MV_ANY;
+  qc.print_cluster_membership = false; /* default = do not output case cluster 
membership */
+  qc.print_initial_clusters = false;   /* default = do not print initial 
clusters */
+  qc.save_cluster_membership = false;  /* default = do not save case cluster 
membership */
+  qc.save_cluster_distance = false;    /* default = do not save case cluster 
distance */
 
   if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
                              PV_NO_DUPLICATE | PV_NUMERIC))
@@ -536,6 +591,34 @@
                goto error;
            }     
        }
+      else if (lex_match_id (lexer, "SAVE"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "CLUSTER"))
+                qc.save_cluster_membership = true;
+             else if (lex_match_id (lexer, "DISTANCE"))
+               qc.save_cluster_distance = true;
+             else
+                goto error;
+           }
+       }
+      else if (lex_match_id (lexer, "PRINT"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "CLUSTER"))
+                qc.print_cluster_membership = true;
+             else if (lex_match_id (lexer, "INITIAL"))
+               qc.print_initial_clusters = true;
+             else
+                goto error;
+           }
+       }
       else if (lex_match_id (lexer, "CRITERIA"))
        {
          lex_match (lexer, T_EQUALS);
@@ -595,7 +678,7 @@
 
        kmeans = kmeans_create (&qc);
        kmeans_cluster (kmeans, group, &qc);
-       quick_cluster_show_results (kmeans, &qc);
+       quick_cluster_show_results (kmeans, group, &qc);
        kmeans_destroy (kmeans);
        casereader_destroy (group);
       }

Attachment: qc.pdf
Description: Adobe PDF document

Attachment: qc.sps
Description: application/spss-sps

case_id,cluster,x,y
1,1,10.45,9.38
2,1,10.67,9.17
3,1,10.86,9.63
4,1,8.77,8.45
5,1,8.04,11.77
6,1,10.34,9.83
7,1,10.37,10.54
8,1,11.49,8.18
9,1,10.17,11.10
10,1,11.37,9.16
11,1,10.25,8.83
12,1,8.69,9.92
13,1,10.36,10.39
14,1,10.89,10.51
15,1,9.91,11.39
16,1,11.11,10.91
17,1,11.77,8.47
18,1,9.51,10.46
19,1,10.67,9.33
20,1,9.74,8.85
21,2,-11.01,-9.21
22,2,-10.82,-11.76
23,2,-10.03,-10.29
24,2,-9.54,-9.17
25,2,-10.16,-9.82
26,2,-10.01,-8.63
27,2,-9.62,-10.22
28,2,-11.36,-10.93
29,2,-10.63,-10.97
30,2,-9.53,-10.78
31,2,-9.40,-10.26
32,2,-10.76,-9.76
33,2,-9.92,-10.11
34,2,-10.16,-9.75
35,2,-8.65,-11.31
36,2,-10.10,-10.90
37,2,-11.67,-9.89
38,2,-11.11,-9.23
39,2,-8.72,-8.43
40,2,-11.35,-8.68
41,3,-11.07,9.42
42,3,-10.20,9.00
43,3,-10.12,9.92
44,3,-10.41,10.16
45,3,-9.86,10.12
46,3,-10.31,10.12
47,3,-9.57,10.16
48,3,-9.69,9.93
49,3,-9.14,10.84
50,3,-9.83,10.19
51,3,-9.97,10.22
52,3,-11.65,10.81
53,3,-9.80,11.39
54,3,-10.31,10.74
55,3,-10.26,10.38
56,3,-11.57,10.02
57,3,-10.50,9.75
58,3,-9.06,9.63
59,3,-10.17,10.82
60,3,-10.22,9.99
_______________________________________________
pspp-dev mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/pspp-dev

Reply via email to