I committed linkage.m, which should now be completely compatible with Matlab's. Here it is:
===File ~/gnu/octave-forge/main/statistics/inst/linkage.m===
## Copyright (C) 2008 Francesco Potortì <[EMAIL PROTECTED]>
##
## This software is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3, or (at your option)
## any later version.
##
## This software is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this software; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {@var{y} =} linkage (@var{d})
## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method})
## @deftypefnx {Function File} @
##   {@var{y} =} linkage (@var{x}, @var{method}, @var{metric})
## @deftypefnx {Function File} @
##   {@var{y} =} linkage (@var{x}, @var{method}, @var{arglist})
##
## Produce a hierarchical clustering dendrogram
##
## @var{d} is the dissimilarity matrix relative to @var{n} observations,
## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}.
## Alternatively, @var{x} contains data formatted for input to
## @code{pdist}, @var{metric} is a metric for @code{pdist} and
## @var{arglist} is a cell array containing arguments that are passed to
## @code{pdist}.
##
## @code{linkage} starts by putting each observation into a singleton
## cluster and numbering those from 1 to @var{n}.  Then it merges two
## clusters, chosen according to @var{method}, to create a new cluster
## numbered @var{n+1}, and so on until all observations are grouped into
## a single cluster numbered @var{2*n-1}.  Row @var{m} of the
## @math{m-1}x3 output matrix relates to cluster @math{n+m}: the first
## two columns are the numbers of the two component clusters and column
## 3 contains their distance.
##
## @var{method} defines the way the distance between two clusters is
## computed and how they are recomputed when two clusters are merged:
##
## @table @samp
## @item "single" (default)
## Distance between two clusters is the minimum distance between two
## elements belonging each to one cluster.  Produces a cluster tree
## known as minimum spanning tree.
##
## @item "complete"
## Furthest distance between two elements belonging each to one cluster.
##
## @item "average"
## Unweighted pair group method with averaging (UPGMA).
## The mean distance between all pair of elements each belonging to one
## cluster.
##
## @item "weighted"
## Weighted pair group method with averaging (WPGMA).
## When two clusters A and B are joined together, the new distance to a
## cluster C is the mean between distances A-C and B-C.
##
## @item "centroid"
## Unweighted Pair-Group Method using Centroids (UPGMC).
## Assumes Euclidean metric.  The distance between cluster centroids,
## each centroid being the center of mass of a cluster.
##
## @item "median"
## Weighted pair-group method using centroids (WPGMC).
## Assumes Euclidean metric.  Distance between cluster centroids.  When
## two clusters are joined together, the new centroid is the midpoint
## between the joined centroids.
##
## @item "ward"
## Ward's sum of squared deviations about the group mean (ESS).
## Also known as minimum variance or inner squared distance.
## Assumes Euclidean metric.  How much the moment of inertia of the
## merged cluster exceeds the sum of those of the individual clusters.
## @end table
##
## @strong{Reference}
## Ward, J. H. Hierarchical Grouping to Optimize an Objective Function
## J. Am. Statist. Assoc. 1963, 58, 236-244,
## @url{http://iv.slis.indiana.edu/sw/data/ward.pdf}.
## @end deftypefn
##
## @seealso{pdist,squareform}

## Author: Francesco Potortì <[EMAIL PROTECTED]>

function dgram = linkage (d, method = "single", distarg)

  ## check the input
  if (nargin < 1) || (nargin > 3)
    print_usage ();
  endif

  if (isempty (d))
    error ("linkage: d cannot be empty");
  elseif (nargin < 3 && ~ isvector (d))
    error ("linkage: d must be a vector");
  endif

  ## Each method maps to the handle that recomputes inter-cluster
  ## distances after a merge; the handle receives the two distance rows,
  ## the merged indices and the cluster weights.
  methods = struct ...
    ("name",
     { "single";
       "complete";
       "average";
       "weighted";
       "centroid";
       "median";
       "ward" },
     "distfunc",
     {(@(x) min(x))                                 # single
      (@(x) max(x))                                 # complete
      (@(x,i,j,w) sum(dmult(q=w([i,j]),x))/sum(q))  # average
      (@(x) mean(x))                                # weighted
      (@massdist)                                   # centroid
      (@(x,i) massdist(x,i))                        # median
      (@inertialdist)                               # ward
     });

  mask = strcmp (lower (method), {methods.name});
  if (! any (mask))
    error ("linkage: %s: unknown method", method);
  endif
  dist = {methods.distfunc}{mask};

  ## With three arguments, d holds raw observations: run pdist first.
  if (nargin == 3)
    if (ischar (distarg))
      d = pdist (d, distarg);
    elseif (iscell (distarg))
      d = pdist (d, distarg{:});
    else
      print_usage ();
    endif
  endif

  d = squareform (d, "tomatrix");       # dissimilarity NxN matrix
  n = rows (d);                         # the number of observations
  diagidx = sub2ind ([n,n], 1:n, 1:n);  # indices of diagonal elements
  d(diagidx) = Inf;       # consider a cluster as far from itself

  ## For equal-distance nodes, the order in which clusters are merged
  ## is arbitrary.  Rotating the initial matrix produces an ordering
  ## similar to Matlab's.
  cname = n:-1:1;               # cluster names in d
  d = rot90 (d, 2);             # exchange low and high cluster numbers
  weight = ones (1, n);         # cluster weights
  dgram = zeros (n-1, 3);       # clusters from n+1 to 2*n-1
  for cluster = n+1:2*n-1
    ## Find the two nearest clusters
    [m midx] = min (d(:));
    [r, c] = ind2sub (size (d), midx);
    ## Here is the new cluster
    dgram(cluster-n, :) = [cname(r) cname(c) d(r, c)];
    ## Put it in place of the first one and remove the second
    cname(r) = cluster;
    cname(c) = [];
    ## Compute the new distances
    newd = dist (d([r c], :), r, c, weight);
    newd(r) = Inf;              # take care of the diagonal element
    ## Put distances in place of the first ones, remove the second ones
    d(r,:) = newd;
    d(:,r) = newd';
    d(c,:) = [];
    d(:,c) = [];
    ## The new weight is the sum of the components' weights
    weight(r) += weight(c);
    weight(c) = [];
  endfor
  ## Sort the cluster numbers, as Matlab does
  dgram(:,1:2) = sort (dgram(:,1:2), 2);

  ## Check that distances are monotonically increasing
  if (any (diff (dgram(:,3)) < 0))
    warning ("clustering",
             "linkage: cluster distances do not monotonically increase\n\
	you should probably use a method different from \"%s\"", method);
  endif

endfunction

## Take two row vectors, which are the Euclidean distances of clusters I
## and J from the others.  Column I of second row contains the distance
## between clusters I and J.  The centre of gravity of the new cluster
## is on the segment joining the old ones.  W are the weights of all
## clusters.  Use the law of cosines to find the distances of the new
## cluster from all the others.
function y = massdist (x, i, j, w)
  x .^= 2;                      # squared Euclidean distances
  c2 = x(2, i);                 # squared distance between I and J
  if (nargin < 4)               # median distance
    qi = 0.5;                   # equal weights ("weighted")
  else                          # centroid distance
    qi = 1 / (1 + w(j) / w(i)); # the cluster weights
  endif
  y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2));
endfunction

## Take two row vectors, which are the inertial distances of clusters I
## and J from the others.  Column I of second row contains the inertial
## distance between clusters I and J.  The centre of gravity of the new
## cluster K is on the segment joining I and J.  W are the weights of
## all clusters.  Convert inertial to Euclidean distances, then use the
## law of cosines to find the Euclidean distances of K from all the
## other clusters, convert them back to inertial distances and return
## them.
function y = inertialdist (x, i, j, w)
  x .^= 2;                      # squared inertial distances
  wi = w(i); wj = w(j);         # the cluster weights
  sij = wi + wj;                # sum of weights of I and J
  c2 = x(2,i) * sij / wi / wj;  # squared Eucl. dist. between I and J
  s = [wi + w; wj + w];         # sum of weights for all cluster pairs
  p = [wi * w; wj * w];         # product of weights for all cluster pairs
  x .*= s ./ p;                 # convert inertial dist. to squared Eucl.
  qi = wi/sij;                  # normalise the weights of I and J
  ## Squared Euclidean distances between all clusters and new cluster K
  x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*c2);
  y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial
endfunction

%!shared x, t
%! x = reshape(mod(magic(6),5),[],3);
%! t = 1e-6;
%!assert (cond (linkage (pdist (x))), 34.119045, t);
%!assert (cond (linkage (pdist (x), "complete")), 21.793345, t);
%!assert (cond (linkage (pdist (x), "average")), 27.045012, t);
%!assert (cond (linkage (pdist (x), "weighted")), 27.412889, t);
%!warning <monotonically> linkage (pdist (x), "centroid");
%!test warning off clustering
%! assert (cond (linkage (pdist (x), "centroid")), 27.457477, t);
%! warning on clustering
%!warning <monotonically> linkage (pdist (x), "median");
%!test warning off clustering
%! assert (cond (linkage (pdist (x), "median")), 27.683325, t);
%! warning on clustering
%!assert (cond (linkage (pdist (x), "ward")), 17.195198, t);
%!assert (cond (linkage(x,"ward","euclidean")), 17.195198, t);
%!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198, t);
%!assert (cond (linkage(x,"ward",{"minkowski", 2})),17.195198, t);
============================================================
-- 
Francesco Potortì (ricercatore)       Voice:  +39 050 315 3058 (op.2111)
ISTI - Area della ricerca CNR         Fax:    +39 050 315 2040
via G. Moruzzi 1, I-56124 Pisa        Email:  [EMAIL PROTECTED]
(entrance 20, 1st floor, room C71)    Web:    http://fly.isti.cnr.it/

-------------------------------------------------------------------------
This SF.Net email is sponsored by the Moblin Your Move Developer's challenge
Build the coolest Linux based applications with Moblin SDK & win great prizes
Grand prize is a trip for two to an Open Source event anywhere in the world
http://moblin-contest.org/redirect.php?banner_id=100&url=/
_______________________________________________
Octave-dev mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/octave-dev
