Attached is a patch to 0.8.5 quick-cluster.c to support the /PRINT=CLUSTER subcommand that identifies the cluster membership of each case in the analysis. I also included test syntax, data, and output.
John has suggested some stylistic changes that are included in this patch, but three last three suggestions are not in this patch: > I think this patch could be checked in if we also do the following: > > > 1. Add the necessary changes to doc/statistics.texi documenting the > /PRINT subcommand. > > 2. Add a test to tests/language/stats/quick-cluster.at which exercises > the /PRINT subcommand. > > 3. We should omit the syntax parsing code for the /SAVE command, because > it doesn't do anything yet. In addition, I've been looking over the regression.c code to see how it implements /SAVE but I'm not sure I can replicate it in the cluster code. -Alan On 7/27/2015 10:07 PM, Ben Pfaff wrote: > On Tue, Jul 28, 2015 at 04:47:51AM +0200, John Darrington wrote: >> On Mon, Jul 27, 2015 at 12:11:19PM -0500, Alan Mead wrote: >> >> In the attached files, the .patch is the patch for >> pspp-0.8.5/src/language/stats/quick-cluster.c (let me know if it's done >> incorrectly) >> >> It looks good to me. Just some minor quibbles about style: > It seems that I didn't receive the patch, and it doesn't seem to be in > the mailing list archive. Alan, perhaps you should re-send it? -- Alan D. Mead, Ph.D. President, Talent Algorithms Inc. science + technology = better workers +815.588.3846 (Office) +267.334.4143 (Mobile) http://www.alanmead.org Announcing the Journal of Computerized Adaptive Testing (JCAT), a peer-reviewed electronic journal designed to advance the science and practice of computerized adaptive testing: http://www.iacat.org/jcat
--- pspp-0.8.5-original/src/language/stats/quick-cluster.c 2015-05-30
18:13:57.000000000 -0500
+++ pspp-0.8.5/src/language/stats/quick-cluster.c 2015-07-27
07:59:42.455444022 -0500
@@ -60,6 +60,10 @@
int ngroups; /* Number of group. (Given by the user) */
int maxiter; /* Maximum iterations (Given by the user) */
+ int print_cluster_membership; /* true => print membership */
+ int print_initial_clusters; /* true => print initial cluster */
+ int save_cluster_membership; /* true => save cluster membership as QCL_1 in
dataset */
+ int save_cluster_distance; /* true => save cluster distance as QCL_2 in
dataset */
const struct variable *wv; /* Weighting variable. */
@@ -89,7 +93,7 @@
static struct Kmeans *kmeans_create (const struct qc *qc);
-static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc
*qc);
+static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *qc);
static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c,
const struct qc *);
@@ -104,9 +108,11 @@
static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial,
const struct qc *);
+static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *);
+
static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const
struct qc *);
-static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc
*);
+static void quick_cluster_show_results (struct Kmeans *kmeans, const struct
casereader *reader, const struct qc *);
int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
@@ -152,7 +158,7 @@
/* Creates random centers using randomly selected cases from the data. */
static void
-kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc)
+kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
{
int i, j;
for (i = 0; i < qc->ngroups; i++)
@@ -346,11 +352,12 @@
bool redo;
int diffs;
bool show_warning1;
+ int redo_count = 0;
show_warning1 = true;
cluster:
redo = false;
- kmeans_randomize_centers (kmeans, qc);
+ kmeans_randomize_centers (kmeans, reader, qc);
for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter;
kmeans->lastiter++)
{
@@ -377,8 +384,13 @@
break;
}
}
+
if (redo)
- goto cluster;
+ {
+ redo_count++;
+ assert (redo_count < 10);
+ goto cluster;
+ }
}
@@ -446,6 +458,40 @@
tab_submit (t);
}
+/* Reports cluster membership for each case. */
+static void
+quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
+{
+ struct tab_table *t;
+ int nc, nr;
+ int i, clust;
+ struct ccase *c;
+ struct casereader *cs = casereader_clone (reader);
+ nc = 2;
+ nr = kmeans->n + 1;
+ t = tab_create (nc, nr);
+ tab_headers (t, 0, nc - 1, 0, 0);
+ tab_title (t, _("Cluster Membership"));
+ tab_text (t, 0, 0, TAB_CENTER, _("Case Number"));
+ tab_text (t, 1, 0, TAB_CENTER, _("Cluster"));
+ tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
+
+ for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c))
+ {
+ assert (i < kmeans->n);
+ /* get cluster membership (does not reflect re-ordered centers) */
+ clust = kmeans_get_nearest_group (kmeans, c, qc);
+ /* make cluster membership reflect re-ordered centers */
+ clust = kmeans->group_order->data[clust];
+ tab_text_format (t, 0, i + 1, TAB_CENTER, "%d", (i + 1));
+ tab_text_format (t, 1, i + 1, TAB_CENTER, "%d", (clust + 1));
+ }
+ assert (i == kmeans->n);
+ tab_submit (t);
+ casereader_destroy (cs);
+}
+
+
/* Reports number of cases of each single cluster. */
static void
quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
@@ -479,13 +525,15 @@
/* Reports. */
static void
-quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc)
+quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader
*reader, const struct qc *qc)
{
- kmeans_order_groups (kmeans, qc);
- /* Uncomment the line below for reporting initial centers. */
- /* quick_cluster_show_centers (kmeans, true); */
+ kmeans_order_groups (kmeans, qc); /* what does this do? */
+ if( qc->print_initial_clusters )
+ quick_cluster_show_centers (kmeans, true, qc);
quick_cluster_show_centers (kmeans, false, qc);
quick_cluster_show_number_cases (kmeans, qc);
+ if( qc->print_cluster_membership )
+ quick_cluster_show_membership(kmeans, reader, qc);
}
int
@@ -499,6 +547,10 @@
qc.maxiter = 2;
qc.missing_type = MISS_LISTWISE;
qc.exclude = MV_ANY;
+ qc.print_cluster_membership = false; /* default = do not output case cluster
membership */
+ qc.print_initial_clusters = false; /* default = do not print initial
clusters */
+ qc.save_cluster_membership = false; /* default = do not save case cluster
membership */
+ qc.save_cluster_distance = false; /* default = do not save case cluster
distance */
if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
PV_NO_DUPLICATE | PV_NUMERIC))
@@ -536,6 +588,34 @@
goto error;
}
}
+ else if (lex_match_id (lexer, "SAVE"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD
+ && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "CLUSTER"))
+ qc.save_cluster_membership = true;
+ else if (lex_match_id (lexer, "DISTANCE"))
+ qc.save_cluster_distance = true;
+ else
+ goto error;
+ }
+ }
+ else if (lex_match_id (lexer, "PRINT"))
+ {
+ lex_match (lexer, T_EQUALS);
+ while (lex_token (lexer) != T_ENDCMD
+ && lex_token (lexer) != T_SLASH)
+ {
+ if (lex_match_id (lexer, "CLUSTER"))
+ qc.print_cluster_membership = true;
+ else if (lex_match_id (lexer, "INITIAL"))
+ qc.print_initial_clusters = true;
+ else
+ goto error;
+ }
+ }
else if (lex_match_id (lexer, "CRITERIA"))
{
lex_match (lexer, T_EQUALS);
@@ -600,7 +680,7 @@
kmeans = kmeans_create (&qc);
kmeans_cluster (kmeans, group, &qc);
- quick_cluster_show_results (kmeans, &qc);
+ quick_cluster_show_results (kmeans, group, &qc);
kmeans_destroy (kmeans);
casereader_destroy (group);
}
qc.sps
Description: application/spss-sps
case_id,cluster,x,y 1,1,10.45,9.38 2,1,10.67,9.17 3,1,10.86,9.63 4,1,8.77,8.45 5,1,8.04,11.77 6,1,10.34,9.83 7,1,10.37,10.54 8,1,11.49,8.18 9,1,10.17,11.10 10,1,11.37,9.16 11,1,10.25,8.83 12,1,8.69,9.92 13,1,10.36,10.39 14,1,10.89,10.51 15,1,9.91,11.39 16,1,11.11,10.91 17,1,11.77,8.47 18,1,9.51,10.46 19,1,10.67,9.33 20,1,9.74,8.85 21,2,-11.01,-9.21 22,2,-10.82,-11.76 23,2,-10.03,-10.29 24,2,-9.54,-9.17 25,2,-10.16,-9.82 26,2,-10.01,-8.63 27,2,-9.62,-10.22 28,2,-11.36,-10.93 29,2,-10.63,-10.97 30,2,-9.53,-10.78 31,2,-9.40,-10.26 32,2,-10.76,-9.76 33,2,-9.92,-10.11 34,2,-10.16,-9.75 35,2,-8.65,-11.31 36,2,-10.10,-10.90 37,2,-11.67,-9.89 38,2,-11.11,-9.23 39,2,-8.72,-8.43 40,2,-11.35,-8.68 41,3,-11.07,9.42 42,3,-10.20,9.00 43,3,-10.12,9.92 44,3,-10.41,10.16 45,3,-9.86,10.12 46,3,-10.31,10.12 47,3,-9.57,10.16 48,3,-9.69,9.93 49,3,-9.14,10.84 50,3,-9.83,10.19 51,3,-9.97,10.22 52,3,-11.65,10.81 53,3,-9.80,11.39 54,3,-10.31,10.74 55,3,-10.26,10.38 56,3,-11.57,10.02 57,3,-10.50,9.75 58,3,-9.06,9.63 59,3,-10.17,10.82 60,3,-10.22,9.99
test3.pdf
Description: Adobe PDF document
_______________________________________________ pspp-dev mailing list [email protected] https://lists.gnu.org/mailman/listinfo/pspp-dev
