Hans-Joerg Bibiko's function levenshtein() would help; see below for an
example (very clumsy, with two nested loops, but you can tidy that up with
the apply family of functions).
HTH,
STG


levenshtein <- function(string1, string2, case=TRUE, map=NULL) {
        ########
        # levenshtein algorithm in R
        #
        # Author  : Hans-Joerg Bibiko
        # Date    : 29/06/2006
        # Contact : bib...@eva.mpg.de
        ########
        # Computes the Levenshtein (edit) distance between two strings: the
        # minimum number of single-character insertions, deletions and
        # substitutions needed to turn one string into the other.
        #
        # string1, string2 := strings to compare (only the first element of
        #                     each argument is used)
        # case = TRUE := case sensitivity; case = FALSE := case insensitivity
        # map := character vector of c(regexp1, replacement1, regexp2,
        #        replacement2, ...); the pairs are applied in order with
        #        gsub() to both strings before they are compared
        #   example:
        #      map <- c("[aeiou]","V","[^aeiou]","C") := replaces all vowels
        #             with V and all others with C
        #      levenshtein("Bank","Bond", map=map)   =>  0
        #
        # returns := the distance as a single number (0 for equal strings;
        #            the length of the other string if one string is empty)
        ########

        # `case` is a scalar flag, so a plain if/else is enough (no ifelse()).
        fold_case <- function(x) if (case) x else tolower(x)

        string1 <- fold_case(as.character(string1)[1])
        string2 <- fold_case(as.character(string2)[1])

        if (!is.null(map)) {
                m <- matrix(map, ncol=2, byrow=TRUE)
                s <- c(string1, string2)
                # seq_len() iterates zero times for an empty map, whereas
                # 1:0 would index a non-existent row.
                for (i in seq_len(nrow(m))) s <- gsub(m[i,1], m[i,2], s)
                # Fold again after mapping so that replacements differing
                # only in case are still treated as equal when case=FALSE.
                string1 <- fold_case(s[1])
                string2 <- fold_case(s[2])
        }

        if (string1 == string2) return(0)

        # A leading blank is prepended so that element k of s1/s2 lines up
        # with row/column k of the distance matrix; row 1 and column 1 stand
        # for the empty prefix.
        s1 <- strsplit(paste0(" ", string1), NULL)[[1]]
        s2 <- strsplit(paste0(" ", string2), NULL)[[1]]

        l1 <- length(s1)
        l2 <- length(s2)

        # One string is empty: the distance is the length of the other one.
        # Without this guard 2:l1 (or 2:l2) would count down to 1 and index
        # outside the matrix.
        if (l1 == 1L || l2 == 1L) return(max(l1, l2) - 1)

        d <- matrix(0, nrow = l1, ncol = l2)
        d[, 1] <- seq_len(l1) - 1
        d[1, ] <- seq_len(l2) - 1

        for (i in 2:l1) {
                for (j in 2:l2) {
                        cost <- if (s1[i] == s2[j]) 0 else 1
                        d[i, j] <- min(d[i-1, j] + 1,      # deletion
                                       d[i, j-1] + 1,      # insertion
                                       d[i-1, j-1] + cost) # substitution
                }
        }

        d[l1, l2]
} # end of function Hans-Joerg Bibiko's levenshtein

# Demo: cluster ten random "names" by their pairwise Levenshtein distance.

# generate names: ten random lowercase strings of 4 to 10 letters each
# (the length is drawn first, then the letters, for each name in turn)
set.seed(1)
all.names <- vapply(seq_len(10), function(i) {
   name.length <- sample(4:10, 1)
   paste(sample(letters, name.length, replace = TRUE), collapse = "")
}, character(1))
all.names

# generate matrix: square, zero-filled, labelled by the names on both sides
sims <- matrix(0, nrow = length(all.names), ncol = length(all.names),
               dimnames = list(all.names, all.names))

# fill matrix: compute each unordered pair once (upper triangle) ...
pairs <- which(upper.tri(sims), arr.ind = TRUE)
sims[pairs] <- vapply(seq_len(nrow(pairs)), function(p) {
   levenshtein(all.names[pairs[p, 1]], all.names[pairs[p, 2]])
}, numeric(1))
# ... then mirror it; the diagonal and lower triangle are still zero, so
# adding the transpose yields the symmetric distance matrix.
sims <- sims + t(sims)

plot(hclust(as.dist(sims)))

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to