Hi all, 
Please find below my take on the hac algorithm. I'd like to hear how I could 
improve on it. 
Especially the get-closest-pair function is ugly. I also don't like that I need 
transform the cluster to something vijual can draw. 
It would be much nicer to simply represent the tree as a nested vector (or 
list) but then I can't think of a way to do efficient removal of clusters.
Cheers
Andreas




(ns clj-sentiment.clustering.hac
  (:require [clj-sentiment.core :as core]))

;;; A bunch of sparse vector functions. Sparse vectors are represented as maps.

(defn div [v n]
  "Divide sparse vec v by n"
  (reduce (fn [m [k val]] (assoc m k (/ val n))) {} v))

(defn sum [a b]
  "Component wise sum of a and b"
  (merge-with + a b))

(defn average [& vecs]
  "Calculate the average over vecs."
  (let [n (count vecs)]
    (reduce sum (map #(div % n) vecs))))

(defn diff-squared [a b]
  "Component wise difference of a and b"
  (merge-with (fn [a b] (let [diff (- a b)] (* diff diff))) a b))

(defn eucl-dist [v1 v2]
  (Math/sqrt (reduce + (vals  (diff-squared v1 v2)))))

;;; Enough sparse vector stuff ... Below is the actual algorighm...

;; "Uses memoization to remember distance calculations."
(def distance
  (memoize (fn [x y metric] (metric x y))))

(defn get-closest-pair [l metric]
  "Gets the closest vector pair according to metric."
  (first (sort-by peek
                  (map (fn [[id1 vec1 id2 vec2]]
                         [id1 id2 (distance vec1 vec2 metric)])
                       (for [[id1 cl1 :as a] l [id2 cl2 :as b] (rest l) :when 
(not= a b)] [id1 (:vec cl1) id2 (:vec cl2)])))))

(defn create-cluster [{label :id :as document} id]
  (hash-map id {:label label :vec (core/get-feature-vec document) :left nil 
:right nil :dist nil}))

(defn create-initial-clusters [l]
  "Initially, there is one cluster per document. Clusters are stored in a map 
identified by their id for fast lookup."
  (apply merge (map create-cluster l (iterate inc 1))))

(defn hac [l metric]
  "Hierarchical agglomerative clustering algorithm."
  (loop [clust (create-initial-clusters l) id -1]
    (if (<= (count clust) 1) clust
        (let [[idi idj dist] (get-closest-pair clust metric)
              clusti (clust idi)
              clustj (clust idj)
              mergevec (average (:vec clusti) (:vec clustj))
              newclust {:left {idi clusti} :right {idj clustj} :dist dist :vec 
mergevec}]
          (recur (-> clust (dissoc idi) (dissoc idj) (assoc id newclust)) (dec 
id))))))

(defn tree-vis [[id {l :left r :right label :label} :as node] acc]
  (cond (nil? node) acc
        (and (nil? l) (nil? r)) label
        :else (conj acc (tree-vis (first r) acc) (tree-vis (first l) acc) '*)))

(vijual/draw-binary-tree (tree-vis (first (hac/hac l hac/eucl-dist)) ()))

             +---+
              |  * |
             +---+
             /     \___      
            /              \     
       +---+        +---+
       | * |            | *   |
       +---+        +---+
      /     \              /     \      
     /       \            /          \     
+---+      +---+  +---+     +---+
| D |        |  E |   | C |       | *  |
+---+      +---+  +---+     +---+
                                         /     \      
                                        /       \     
                                   +---+      +---+
                                   | A  |         | B |
                                   +---+       +---+

-- 
You received this message because you are subscribed to the Google
Groups "Clojure" group.
To post to this group, send email to clojure@googlegroups.com
Note that posts from new members are moderated - please be patient with your 
first post.
To unsubscribe from this group, send email to
clojure+unsubscr...@googlegroups.com
For more options, visit this group at
http://groups.google.com/group/clojure?hl=en

Reply via email to