This is what I have done. Is there a better way of doing this?

val df = spark.read.option("header", "false").csv("data")
/**
 * Dot product of two vectors of equal size.
 *
 * Only the active (explicitly stored) entries of `vectorA` are visited, so
 * the cost scales with the number of stored entries rather than with the
 * full vector size. This matters for sparse vectors, where most entries are
 * not stored.
 *
 * (The name is a typo for "dotProduct"; it is kept so existing callers keep
 * working.)
 *
 * @param vectorA first vector
 * @param vectorB second vector; must have the same size as `vectorA`
 * @return the sum over i of vectorA(i) * vectorB(i)
 * @throws IllegalArgumentException if the vector sizes differ
 */
def dorProduct(vectorA: Vector, vectorB: Vector): Double = {
  require(
    vectorA.size == vectorB.size,
    s"Vector sizes differ: ${vectorA.size} vs ${vectorB.size}")
  // Local accumulator: this closure runs in-process, not as a distributed Spark closure.
  var dp = 0.0
  vectorA.foreachActive { (i, a) =>
    if (a != 0.0) dp += a * vectorB(i)
  }
  dp
}

/**
 * Cosine similarity of two vectors of equal size.
 *
 * @param vectorA first vector
 * @param vectorB second vector; must have the same size as `vectorA`
 * @return dot(A, B) / (|A| * |B|), or 0.0 when either vector has zero norm
 *         (the quotient would otherwise be NaN)
 * @throws IllegalArgumentException if the vector sizes differ
 */
def cosineSimilarity(vectorA: Vector, vectorB: Vector): Double = {
  val dotProduct = dorProduct(vectorA, vectorB)
  val normA = math.sqrt(dorProduct(vectorA, vectorA))
  val normB = math.sqrt(dorProduct(vectorB, vectorB))
  val denominator = normA * normB
  if (denominator == 0.0) 0.0 else dotProduct / denominator
}

/**
 * Finds the row most similar to the row identified by `uri` and returns its
 * `_c1` value.
 *
 * Similarity is the dot product of the "tf-idf" vectors, and only rows with
 * a strictly positive score are considered. To rank by cosine similarity
 * instead, call `cosineSimilarity` in place of `dorProduct` in the UDF below.
 *
 * The scoring runs as a column expression and the best row is brought back
 * with `head(1)`. Mutating driver-side variables from inside
 * `Dataset.foreach` is not a reliable way to collect a result, because the
 * closure is shipped to executors and their updates are not guaranteed to
 * reach the driver.
 *
 * @param uri value of column `_c0` identifying the query row
 * @param ds  data frame with columns `_c0`, `_c1` and a vector column "tf-idf"
 * @return the `_c1` value of the best-scoring row whose `_c0` differs from `uri`
 * @throws NoSuchElementException if no row has `_c0 == uri`, or if no other
 *                                row has a positive score
 */
def nearestNeighbour(uri: String, ds: DataFrame): String = {
  import org.apache.spark.sql.functions.{col, udf}

  // Column expressions instead of string-built SQL: a quote in `uri` cannot break the filter.
  val source: Vector = ds
    .filter(col("_c0") === uri)
    .select(col("tf-idf"))
    .head(1)
    .headOption
    .map(_.getAs[Vector](0))
    .getOrElse(throw new NoSuchElementException(s"No row with _c0 == '$uri'"))

  val score = udf((candidate: Vector) => dorProduct(source, candidate))

  ds.filter(col("_c0") =!= uri)
    .select(col("_c1"), score(col("tf-idf")).as("score"))
    .filter(col("score") > 0.0)
    .orderBy(col("score").desc)
    .head(1)
    .headOption
    .map(_.getString(0))
    .getOrElse(
      throw new NoSuchElementException(s"No neighbour with a positive score for '$uri'"))
}

// Feature pipeline: tokenize column _c2, hash the tokens into term frequencies,
// then apply IDF weighting.
val tk = new Tokenizer().setInputCol("_c2").setOutputCol("words")
val tf = new HashingTF().setInputCol("words").setOutputCol("tf")
val idf = new IDF().setInputCol("tf").setOutputCol("tf-idf")

val df1 = tf.transform(tk.transform(df))
val idfs = idf.fit(df1).transform(df1)

println(nearestNeighbour("http://dbpedia.org/resource/Barack_Obama", idfs))

// On Sun, Nov 13, 2016 at 7:04 PM, Meeraj Kunnumpurath < mee...@servicesymphony.com> wrote:
// > Hello,
// >
// > I have a dataset containing TF-IDF vectors for a corpus of documents. How
// > do I perform a nearest neighbour search on the dataset, using cosine
// > similarity?
>
> val df = spark.read.option("header", "false").csv("data")
>
> val tk = new Tokenizer().setInputCol("_c2").setOutputCol("words")
>
> val tf = new HashingTF().setInputCol("words").setOutputCol("tf")
>
> val idf = new IDF().setInputCol("tf").setOutputCol("tf-idf")
>
> val df1 = tf.transform(tk.transform(df))
>
> idf.fit(df1).transform(df1).select("tf-idf").show(10)
>
> Thank you
>
> --
> *Meeraj Kunnumpurath*
>
> *Director and Executive Principal*
> *Service Symphony Ltd*
> *00 44 7702 693597*
> *00 971 50 409 0169*
> *mee...@servicesymphony.com <mee...@servicesymphony.com>*

--
*Meeraj Kunnumpurath*

*Director and Executive Principal*
*Service Symphony Ltd*
*00 44 7702 693597*
*00 971 50 409 0169*
*mee...@servicesymphony.com <mee...@servicesymphony.com>*