--- kmeans.m	2012-04-19 23:34:49.442411501 -0500
+++ /home/daniel/Documents/School/CSCI 6653 Spr 12/test2/kmeans.m	2012-04-19 23:45:17.234427575 -0500
@@ -1,20 +1,41 @@
-## Copyright (C) 2011 Soren Hauberg <soren@hauberg.org>
+## Copyright (C) 2011 Soren Hauberg
 ##
-## This program is free software; you can redistribute it and/or modify it under
-## the terms of the GNU General Public License as published by the Free Software
-## Foundation; either version 3 of the License, or (at your option) any later
-## version.
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or
+## (at your option) any later version.
+## 
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+## 
+## You should have received a copy of the GNU General Public License
+## along with this program; if not, write to the Free Software
+## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ##
-## This program is distributed in the hope that it will be useful, but WITHOUT
-## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-## details.
-##
-## You should have received a copy of the GNU General Public License along with
-## this program; if not, see <http://www.gnu.org/licenses/>.
+## Additional features added by Daniel Ward April 19, 2012
 
-function [classes, centers] = kmeans (data, k, varargin)
-  ## Input checking
+function [classes, centers, sumd, D] = kmeans (data, k, varargin)
+  [reg, prop] = parseparams(varargin);
+  
+  #defaults for options
+  emptyaction = 'error';
+  start = "sample";
+  
+  #used for getting the number of samples
+  [N, D] = size (data);
+  
+  #used to hold the distances from each sample to each class
+  D = zeros (N, k);
+  
+  #used for convergence of the centroids
+  err = 1;
+  
+  #initial sum of distances
+  sumd = Inf;
+  
+  ## Input checking, validate the matrix and k
   if (!ismatrix (data) || !isreal (data))
     error ("kmeans: first input argument must be a DxN real data matrix");
   endif
@@ -22,14 +43,20 @@
     error ("kmeans: second input argument must be a scalar");
   endif
   
-  [N, D] = size (data);
-  
-  ## (so far) Harcoded options
-  maxiter = Inf;
-  start = "sample";
+  if(length(varargin) > 0)
+	  ## check for the 'emptyaction' property
+	  found = find(strcmpi(prop,'emptyaction') == 1);
+	  switch(lower(prop{found+1}))
+	  	case 'singleton'
+	  		emptyaction = 'singleton';
+	  	otherwise
+	  		error ("kmeans: unsupported empty cluster action parameter");
+	  endswitch
+  endif
   
-  ## Find initial clusters
-  switch (lower (start))
+  ## check for the 'start' property
+  #found = find(strcmpi(prop,'start') == 1);
+  switch (lower(start))
     case "sample"
       idx = randperm (N) (1:k);
       centers = data (idx, :);
@@ -37,33 +64,73 @@
       error ("kmeans: unsupported initial clustering parameter");
   endswitch
   
+ 
   ## Run the algorithm
-  D = zeros (N, k);
-  iterations = 0;
-  prevcenters = centers;
-  while (true)
+  while err > .001
     ## Compute distances
     for i = 1:k
-      D (:, i) = sum (( data - repmat (centers (i, :), N, 1)).^2, 2);
+      D (:, i) = sumsq(data - repmat(centers(i, :), N, 1), 2);
     endfor
     
     ## Classify
     [tmp, classes] = min (D, [], 2);
     
-    ## Recompute centers
+    ## Calcualte new centroids
     for i = 1:k
+  
+  		## Check for empty clusters
+    	if sum(classes==i)==0 || length(mean(data(classes == i, :))) == 0
+    	
+    		switch(emptyaction)
+    			## if 'singleton', then find the point that is the
+    			## farthest and add it to the empty cluster
+		 		case 'singleton'
+		 			classes(maxCostSampleIndex(data,centers(i,:))) = i;
+		 			
+		 		## if 'error' then throw the error
+		 		otherwise
+		 			error ("kmeans: empty cluster created");
+    		endswitch
+    		
+      endif ## end check for empty clusters
+      
+      ## update the centroids
       centers (i, :) = mean (data (classes == i, :));
+      
     endfor
     
-    ## Check for convergence
-    iterations++;
-    if (all (centers (:) == prevcenters (:)) || iterations >= maxiter)
-      break;
-    endif
-    prevcenters = centers;
-  endwhile
+    ## calculate the differnece in the sum of distances
+    err = sumd - objCost(data,classes,centers);
+    ## update the current sum of distances
+    sumd = objCost(data,classes,centers);
+  endwhile  
 endfunction
 
+## calculate the sum of distances
+function obj = objCost(data,classes,centers)
+	[N D] = size(data);
+	sum = 0;
+  	
+  	for i=1:N
+  		sum = sum + sumsq(data(i,:)-centers(classes(i),:));
+  	endfor
+  	
+  	obj = sum;
+endfunction
+
+function index = maxCostSampleIndex(data,centers)
+	[n m] = size(data); 
+	cost = 0;
+	index = 1;
+	for j = 1:n
+		if cost < sumsq(data(j,:) - centers)
+			cost = sumsq(data(j,:) - centers);
+			index = j;
+		endif
+	endfor		 			
+endfunction
+
+
 %!demo
 %! ## Generate a two-cluster problem
 %! C1 = randn (100, 2) + 1;
