I've uploaded a patch (against quick-cluster.c in 0.8.4) that adds support for the /PRINT=CLUSTER subcommand for k-means clustering to show the cluster membership for each case:
https://savannah.gnu.org/bugs/index.php?41019 But this patch has a remaining bug. The clusters centers are saved in some indirect fashion that I cannot understand. In the patch, I report the cluster number returned by kmeans_get_nearest_group() but these cluster numbers are systematically different from the reported cluster numbers. That is, the centers are stored internally in arbitrary order (as they are discovered, I'd guess) and for purposes of reporting, they are numbered. I cannot replicate that output numbering. For example, in the attached output, the centers were (10,10), (-10,-10), and (-10,10) and 20 cases were generated for each cluster. The CLUSTER command reports 1 = (-10.23, -10.01), 2 = (-10.19, 10.18) and 3=(10.27, 9.82) so the first 20 cases should be members of cluster 3, the next 20 from cluster 3 and the last 20 from cluster 2. But using the results from kmeans_get_nearest_group(), the clusters are reported as 1, then 3, then 2. I don't understand how I can fix this. I think I need to use kmeans->group_order which is a "gsl_permutation" but this is beyond my familiarity with C and GSL. It's also possible that kmeans_order_groups() (which is called at the beginning of quick_cluster_show_results()) is not working properly. Any advice? -Alan -- Alan D. Mead, Ph.D. President, Talent Algorithms Inc. science + technology = better workers +815.588.3846 (Office) +267.334.4143 (Mobile) http://www.alanmead.org Announcing the Journal of Computerized Adaptive Testing (JCAT), a peer-reviewed electronic journal designed to advance the science and practice of computerized adaptive testing: http://www.iacat.org/jcat
--- pspp-0.8.4-original/src/language/stats/quick-cluster.c 2014-08-24
00:51:41.000000000 -0500
+++ pspp-0.8.4/src/language/stats/quick-cluster.c 2015-05-30
16:58:46.049760839 -0500
@@ -60,6 +60,10 @@
int ngroups; /* Number of group. (Given by the user) */
int maxiter; /* Maximum iterations (Given by the user) */
+ int print_cluster_membership; /* true => print membership */
+ int print_initial_clusters; /* true => print initial cluster */
+ int save_cluster_membership; /* true => save cluster membership as QCL_1 in
dataset */
+ int save_cluster_distance; /* true => save cluster distance as QCL_2 in
dataset */
const struct variable *wv; /* Weighting variable. */
@@ -89,7 +93,7 @@
static struct Kmeans *kmeans_create (const struct qc *qc);
-static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc
*qc);
+static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *qc);
static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c,
const struct qc *);
@@ -104,9 +108,11 @@
static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial,
const struct qc *);
+static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *);
+
static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const
struct qc *);
-static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc
*);
+static void quick_cluster_show_results (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *);
int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
@@ -152,7 +158,7 @@
/* Creates random centers using randomly selected cases from the data. */
static void
-kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc)
+kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
{
int i, j;
for (i = 0; i < qc->ngroups; i++)
@@ -346,11 +352,12 @@
bool redo;
int diffs;
bool show_warning1;
+ int redo_count = 0;
show_warning1 = true;
cluster:
redo = false;
- kmeans_randomize_centers (kmeans, qc);
+ kmeans_randomize_centers (kmeans, reader, qc);
for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter;
kmeans->lastiter++)
{
@@ -377,8 +384,13 @@
break;
}
}
+
if (redo)
- goto cluster;
+ {
+ redo_count++;
+ assert (redo_count < 10);
+ goto cluster;
+ }
}
@@ -446,6 +458,43 @@
tab_submit (t);
}
+/* Reports cluster membership for each case. */
+static void
+quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
+{
+ struct tab_table *t;
+ int nc, nr;
+ int i, clust;
+ struct ccase *c;
+ struct casereader *cs = casereader_clone (reader);
+ nc = 2;
+ nr = kmeans->n + 1;
+ t = tab_create (nc, nr);
+ tab_headers (t, 0, nc - 1, 0, 0);
+ tab_title (t, _("Cluster Membership"));
+ tab_text (t, 0, 0, TAB_CENTER, _("Case Number"));
+ tab_text (t, 1, 0, TAB_CENTER, _("Cluster"));
+ tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
+ //tab_text (t, 0, 1, TAB_CENTER, _("1"));
+ //tab_text (t, 1, 1, TAB_CENTER, _("2"));
+ //tab_text (t, 0, 2, TAB_CENTER, _("2"));
+ //tab_text (t, 1, 2, TAB_CENTER, _("3"));
+
+ //for (i = 0; i < kmeans->n; i++)
+ for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c))
+ {
+ assert (i < kmeans->n);
+ clust = kmeans_get_nearest_group (kmeans, c, qc);
+ //int clust = 1; // kmeans_get_nearest_group (kmeans, c, qc);
+ tab_text_format (t, 0, i+1, TAB_CENTER, "%d", (i + 1));
+ tab_text_format (t, 1, i+1, TAB_CENTER, "%d", (clust + 1));
+ }
+ assert (i == kmeans->n);
+ tab_submit (t);
+ casereader_destroy (cs);
+}
+
+
/* Reports number of cases of each single cluster. */
static void
quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
@@ -479,13 +528,15 @@
/* Reports. */
static void
-quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc)
+quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
{
kmeans_order_groups (kmeans, qc);
- /* Uncomment the line below for reporting initial centers. */
- /* quick_cluster_show_centers (kmeans, true); */
+ if( qc->print_initial_clusters )
+ quick_cluster_show_centers (kmeans, true, qc);
quick_cluster_show_centers (kmeans, false, qc);
quick_cluster_show_number_cases (kmeans, qc);
+ if( qc->print_cluster_membership )
+ quick_cluster_show_membership(kmeans, reader, qc);
}
int
@@ -499,6 +550,10 @@
qc.maxiter = 2;
qc.missing_type = MISS_LISTWISE;
qc.exclude = MV_ANY;
+ qc.print_cluster_membership = false; /* default = do not output case cluster
membership */
+ qc.print_initial_clusters = false; /* default = do not print initial
clusters */
+ qc.save_cluster_membership = false; /* default = do not save case cluster
membership */
+ qc.save_cluster_distance = false; /* default = do not save case cluster
distance */
if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
PV_NO_DUPLICATE | PV_NUMERIC))
@@ -536,6 +591,34 @@
goto error;
}
}
+ else if (lex_match_id (lexer, "SAVE"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD
+ && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "CLUSTER"))
+ qc.save_cluster_membership = true;
+ else if (lex_match_id (lexer, "DISTANCE"))
+ qc.save_cluster_distance = true;
+ else
+ goto error;
+ }
+ }
+ else if (lex_match_id (lexer, "PRINT"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD
+ && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "CLUSTER"))
+ qc.print_cluster_membership = true;
+ else if (lex_match_id (lexer, "INITIAL"))
+ qc.print_initial_clusters = true;
+ else
+ goto error;
+ }
+ }
else if (lex_match_id (lexer, "CRITERIA"))
{
lex_match (lexer, T_EQUALS);
@@ -595,7 +678,7 @@
kmeans = kmeans_create (&qc);
kmeans_cluster (kmeans, group, &qc);
- quick_cluster_show_results (kmeans, &qc);
+ quick_cluster_show_results (kmeans, group, &qc);
kmeans_destroy (kmeans);
casereader_destroy (group);
}
qc.pdf
Description: Adobe PDF document
qc.sps
Description: application/spss-sps
case_id,cluster,x,y 1,1,10.45,9.38 2,1,10.67,9.17 3,1,10.86,9.63 4,1,8.77,8.45 5,1,8.04,11.77 6,1,10.34,9.83 7,1,10.37,10.54 8,1,11.49,8.18 9,1,10.17,11.10 10,1,11.37,9.16 11,1,10.25,8.83 12,1,8.69,9.92 13,1,10.36,10.39 14,1,10.89,10.51 15,1,9.91,11.39 16,1,11.11,10.91 17,1,11.77,8.47 18,1,9.51,10.46 19,1,10.67,9.33 20,1,9.74,8.85 21,2,-11.01,-9.21 22,2,-10.82,-11.76 23,2,-10.03,-10.29 24,2,-9.54,-9.17 25,2,-10.16,-9.82 26,2,-10.01,-8.63 27,2,-9.62,-10.22 28,2,-11.36,-10.93 29,2,-10.63,-10.97 30,2,-9.53,-10.78 31,2,-9.40,-10.26 32,2,-10.76,-9.76 33,2,-9.92,-10.11 34,2,-10.16,-9.75 35,2,-8.65,-11.31 36,2,-10.10,-10.90 37,2,-11.67,-9.89 38,2,-11.11,-9.23 39,2,-8.72,-8.43 40,2,-11.35,-8.68 41,3,-11.07,9.42 42,3,-10.20,9.00 43,3,-10.12,9.92 44,3,-10.41,10.16 45,3,-9.86,10.12 46,3,-10.31,10.12 47,3,-9.57,10.16 48,3,-9.69,9.93 49,3,-9.14,10.84 50,3,-9.83,10.19 51,3,-9.97,10.22 52,3,-11.65,10.81 53,3,-9.80,11.39 54,3,-10.31,10.74 55,3,-10.26,10.38 56,3,-11.57,10.02 57,3,-10.50,9.75 58,3,-9.06,9.63 59,3,-10.17,10.82 60,3,-10.22,9.99
_______________________________________________ pspp-dev mailing list [email protected] https://lists.gnu.org/mailman/listinfo/pspp-dev
