Hi John, 1) I renamed the file as "quick-cluster.c"
2. I added an entry to "src/language/stats/automake.mk" for quick-cluster
3. I removed the entry "UNIMPL_CMD ("QUICK CLUSTER", "Fast clustering")" from
command.def file.
4. Now cmd_quick_cluster can parse a command line like:
QUICK CLUSTER x y z
/CRITERIA = CLUSTER(5) MXITER (100).
5. I removed the atoi() function, put
lex_force_int (lexer);
groups = lex_integer (lexer);
instead.
As
I mentioned, i test my results with random data with uniform
distributed random values. It can not be considered as a comprehensive
work and should be tested with simulations.
Regards.
Mehmet Hakan Satman
http://www.mhsatman.com
--- On Sun, 3/13/11, John Darrington <[email protected]> wrote:
From: John Darrington <[email protected]>
Subject: Re: K-Means Clustering
To: "Mehmet Hakan Satman" <[email protected]>
Cc: "John Darrington" <[email protected]>, [email protected]
Date: Sunday, March 13, 2011, 4:36 PM
Hi Mehmet,
Thanks for this. It seems to be basically working. There are a number of
improvements
that can be made however.
1. It'll be more consistent with the rest of PSPP if you call the new file
"quick-cluster.c" with a hyphen.
2. Instead of editing the Makefile, add the name of the new file to the
manifest in
src/language/stats/automake.mk
3. Can you remove the line UNIMPL_CMD ("QUICK CLUSTER", "Fast clustering")
Now the "quick cluster" command can parse these options in the pspp command
line:
quick cluster /VARIABLES=x y z /GROUPS=5 /MAXITER=100.
This is different to the syntax in the SPSS documentation which expects:
QUICK CLUSTER x y z
/CRITERIA = CLUSTER(5) MXITER (100).
where the /CRITERIA subcommand and each part thereof is optional. You can see
an example of how to
implement a /CRITERIA subcommand in src/language/stats/factor.c - in fact, you
may be able to copy much of that parser's code.
Avoid using atoi in the parser. Instead of groups=atoi(lex_tokcstr(lexer));
write :
lex_force_int (lexer);
groups = lex_integer (lexer);
i think a small development pdf documentation does not satisfies the needs of
implementing
something in PSPP.
You're right. The developer documentation is woefully incomplete.
You mentioned earlier that you had tested the results against spss. Do you have
the results
from these tests, and the test data that you used? I would be interested to
see this.
Best regards,
John
--
PGP Public key ID: 1024D/2DE827B3
fingerprint = 8797 A26D 0854 2EAB 0285 A290 8A67 719C 2DE8 27B3
See http://pgp.mit.edu or any PGP keyserver for public key.
/* PSPP - a program for statistical analysis. Copyright (C) 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <config.h> #include <math.h> #include <libpspp/misc.h> #include <libpspp/str.h> #include <libpspp/message.h> #include <data/procedure.h> #include <data/missing-values.h> #include <data/casereader.h> #include <data/casegrouper.h> #include <data/dictionary.h> #include <data/format.h> #include <language/lexer/variable-parser.h> #include <language/command.h> #include <language/lexer/lexer.h> #include <math/moments.h> #include <output/tab.h> #include <output/text-item.h> #include <stdio.h> #include <gsl/gsl_matrix.h> #include <gsl/gsl_statistics.h> #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid struct quick_cluster { const char **varNames; int numGroups; }; /* Struct KMeans: Holds all of the information for the functions. */ struct Kmeans { gsl_matrix *data; //User Data (Given by the user) gsl_matrix *centers; //Centers for groups gsl_vector_int *index; //Integer values from zero to ngroups. Shows group of an observation. gsl_vector *v1,*v2,*v3; //Temporary vector for program. Do not use. int ngroups; //Number of group. (Given by the user) int n; //Number of observations. (Given by the user) int m; //Number of observations. (Given by the user) int maxiter; //Maximum number of iterations (Given by the user) int lastiter; //Show at which iteration it found the solution. int trials; //If not convergence, how many times has clustering done. double *weights; //Double values for handling weights for program use. }; struct Kmeans* kmeans_create(double* data, int n, int m, int ngroups, int maxiter){ int i,j; struct Kmeans *k = (struct Kmeans*) malloc(sizeof(struct Kmeans)); k->data=gsl_matrix_alloc(n,m); k->centers=gsl_matrix_alloc(ngroups, m); k->ngroups=ngroups; k->index=gsl_vector_int_alloc(n); k->n=n; k->m=m; k->maxiter=maxiter; k->lastiter=0; k->trials=0; for (i=0;i<n;i++){ for(j=0;j<m;j++){ gsl_matrix_set(k->data, i, j, data[i * m +j]); } } k->weights = (double*)malloc(sizeof(double) * k->index->size); k->v1 = gsl_vector_alloc(k->centers->size2); k->v2 = gsl_vector_alloc(k->centers->size2); k->v3 = gsl_vector_alloc(k->n); return(k); } void kmeans_randomize_centers(struct Kmeans *kmeans){ int i,j; int randind; double min=0,max=0; min=gsl_matrix_min(kmeans->data); max=gsl_matrix_max(kmeans->data); for (i=0;i<kmeans->centers->size1;i++){ randind=(int)((((double)rand())/RAND_MAX)*kmeans->data->size1); for (j=0;j<kmeans->centers->size2; j++){ //gsl_matrix_set(kmeans->centers, i, j, min + (((double)rand())/RAND_MAX)*(max-min)); gsl_matrix_set(kmeans->centers,i,j, gsl_matrix_get(kmeans->data, randind,j)); } } } void kmeans_randomize_index(struct Kmeans *kmeans){ int i; for (i=0;i<kmeans->index->size;i++){ kmeans->index->data[i]=-1; } } void kmeans_print(struct Kmeans* kmeans){ int i,j; printf("Number of groups: %d\n",kmeans->ngroups); printf("Centers:\n"); for (i=0;i<kmeans->centers->size1;i++) { for (j=0;j<kmeans->centers->size2;j++){ printf("%f ",gsl_matrix_get(kmeans->centers, i,j)); } printf("\n"); } printf("Index:\n"); for (i=0;i<kmeans->n;i++){ printf("%d ",gsl_vector_int_get(kmeans->index, i)); } printf("\nLast iter: %d\n",kmeans->lastiter); } void print_matrix(gsl_matrix *m){ int i,j; for (i=0;i<m->size1;i++){ for (j=0;j<m->size2;j++){ printf("%f ",m->data[i * m->size2 + j]); } printf("\n"); } } double kmeans_euclidean_distance(gsl_vector *v1, gsl_vector *v2){ double result=0.0; double val; int i; for (i=0;i<v1->size;i++){ val=v1->data[i] - v2->data[i]; result+=val*val; } return(result); } int kmeans_num_elements_group(struct Kmeans *kmeans, int group){ int total=0; int i; for (i=0;i<kmeans->n;i++){ if(gsl_vector_int_get(kmeans->index,i)==group) total++; } return(total); } void kmeans_recalculate_centers(struct Kmeans *kmeans){ int i,j,h; int elm; double mean; gsl_vector *v1=kmeans->v3; for (i=0;i<kmeans->ngroups;i++){ elm=kmeans_num_elements_group(kmeans,i); for (j=0;j<kmeans->index->size;j++){ if(gsl_vector_int_get(kmeans->index,j)==i){ kmeans->weights[j]=1.0; }else{ kmeans->weights[j]=0.0; } } for (h=0;h<kmeans->m;h++){ gsl_matrix_get_col(v1,kmeans->data, h); mean=gsl_stats_wmean(kmeans->weights, 1, v1->data ,1, v1->size); gsl_matrix_set(kmeans->centers, i,h, mean); } } } void kmeans_calculate_indexes(struct Kmeans *kmeans){ double dist; double mindist; int bestindex=0; int i,j; gsl_vector *v1 = kmeans->v1; gsl_vector *v2 = kmeans->v2; for (i=0;i<kmeans->n; i++){ mindist=INFINITY; gsl_matrix_get_row(v1, kmeans->data, i); for (j=0;j<kmeans->ngroups; j++){ gsl_matrix_get_row(v2, kmeans->centers,j); dist=kmeans_euclidean_distance(v1,v2); if(dist<mindist){ mindist=dist; bestindex=j; } } gsl_vector_int_set(kmeans->index,i,bestindex); } } int kmeans_check_converge(gsl_vector_int *current, gsl_vector_int *old, struct Kmeans *kmeans){ int i; int total=0; for (i=0;i<current->size;i++) { if(current->data[i] == old->data[i]) total++; old->data[i]=current->data[i]; } return(current->size-total); } gsl_matrix* kmeans_getGroup(struct Kmeans *kmeans, int index){ int i; int j=0; int elm=kmeans_num_elements_group(kmeans,index); gsl_matrix *agroup=gsl_matrix_alloc(elm, kmeans->data->size2); gsl_vector *v1=gsl_vector_alloc(kmeans->data->size2); for(i=0;i<kmeans->data->size1;i++){ if(kmeans->index->data[i]==index){ gsl_matrix_get_row(v1, kmeans->data, i); gsl_matrix_set_row(agroup, j, v1); j++; } } return(agroup); } void kmeans_cluster(struct Kmeans *kmeans){ int i; double *ind; double sum; bool redo; gsl_vector_int *oldindex = gsl_vector_int_alloc(kmeans->index->size); cluster: redo=false; kmeans_randomize_centers(kmeans); for (kmeans->lastiter=0;kmeans->lastiter<kmeans->maxiter;kmeans->lastiter++){ kmeans_calculate_indexes(kmeans); kmeans_recalculate_centers(kmeans); if(kmeans_check_converge(kmeans->index, oldindex, kmeans)==0) break; } for (i=0;i<kmeans->ngroups;i++){ if(kmeans_num_elements_group(kmeans,i)==0) { kmeans->trials++; redo=true; break; } } if(redo) goto cluster; } void quick_cluster_show_results(struct Kmeans* kmeans){ int i,j; printf("Number of cases: %d\n",kmeans->n); printf("Number of variables: %d\n",kmeans->m); printf("Number of groups: %d\n",kmeans->ngroups); printf("Number of trials: %d\n",(kmeans->trials+1)); printf("Number of iterations at last trial: %d\n",(kmeans->lastiter+1)); printf("Centers:\n"); for (i=0;i<kmeans->centers->size1;i++){ printf("Center of Group %d: ",(i+1)); for(j=0;j<kmeans->centers->size2;j++){ printf("%0.3f ",kmeans->centers->data[i * kmeans->centers->size2 + j]); } printf("\n"); } printf("Data Index:\n"); for (int i=0;i<kmeans->index->size;i++){ printf("%3d ",kmeans->index->data[i]+1); } printf("\n"); } int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) { struct Kmeans *kmeans; double* data; struct ccase *c; bool ok; struct dictionary *dict = dataset_dict(ds); int n; struct variable **variables; struct casereader *input, *inputnew; int groups=0; int numobs=0; int maxiter=0; struct quick_cluster *qc=(struct quick_cluster*)malloc(sizeof(struct quick_cluster)); int i,j; parse_variables_const (lexer, dict, &variables, &n ,PV_NO_DUPLICATE | PV_NUMERIC); if(lex_match (lexer, T_SLASH)){ if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH) { if (lex_match_id (lexer, "CLUSTERS")) { if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); groups = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } }else if(lex_match_id(lexer, "MXITER")){ if ( lex_force_match (lexer, T_LPAREN)) { lex_force_int (lexer); maxiter = lex_integer (lexer); lex_get (lexer); lex_force_match (lexer, T_RPAREN); } }else{ printf("Error parsing command.\n"); return(CMD_FAILURE); } } } } inputnew=proc_open (ds); numobs=casereader_count_cases(inputnew); if(groups>numobs){ printf("Number of groups cannot be larger than the number of cases.\n"); ok = casereader_destroy (inputnew); proc_commit (ds); return(CMD_FAILURE); } data=(double*)malloc(numobs * n * sizeof(double)); i=0;j=0; for (; (c = casereader_read (inputnew)) != NULL; case_unref (c)) { int v; double x; for (v = 0; v < n; ++v) { x = case_data (c, variables[v])->f; data[i * n + v] = x; } i++; } ok = casereader_destroy (inputnew); ok = proc_commit (ds) && ok; kmeans=kmeans_create(data, numobs, n, groups, maxiter); kmeans_cluster(kmeans); quick_cluster_show_results(kmeans); return(ok); }
command.def
Description: Binary data
automake.mk
Description: Binary data
_______________________________________________ pspp-dev mailing list [email protected] http://lists.gnu.org/mailman/listinfo/pspp-dev
