[ https://issues.apache.org/jira/browse/SPARK-5261?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Guoqiang Li updated SPARK-5261: ------------------------------- Description: Get data: {code:none} normalize_text() { awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ -e 's/«/ /g' | tr 0-9 " " } wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz gzip -d news.2013.en.shuffled.gz normalize_text < news.2013.en.shuffled > data.txt {code} {code:none} import org.apache.spark.mllib.feature.Word2Vec val text = sc.textFile("dataPath").map { t => t.split(" ").toIterable } val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(36). setMinCount(5) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => res1: Float = 375059.84 val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(36). setMinCount(100) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => res3: Float = 1661285.2 val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(1) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => 0.13889 {code} was: Get data: {code:none} normalize_text() { awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ -e 's/\?/ \? 
/g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ -e 's/«/ /g' | tr 0-9 " " } wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz gzip -d news.2013.en.shuffled.gz normalize_text < news.2013.en.shuffled > data.txt {code} {code:none} import org.apache.spark.mllib.feature.Word2Vec val text = sc.textFile("dataPath").map { t => t.split(" ").toIterable } val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(36). setMinCount(100) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => res1: Float = 375059.84 val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(36). setMinCount(5) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => res3: Float = 1661285.2 val word2Vec = new Word2Vec() word2Vec. setVectorSize(100). setSeed(42L). setNumIterations(5). setNumPartitions(1) val model = word2Vec.fit(text) model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / model.getVectors.size => 0.13889 {code} > In some cases, the value of word's vector representation is too big > ------------------------------------------------------------------- > > Key: SPARK-5261 > URL: https://issues.apache.org/jira/browse/SPARK-5261 > Project: Spark > Issue Type: Bug > Components: MLlib > Affects Versions: 1.2.0 > Reporter: Guoqiang Li > > Get data: > {code:none} > normalize_text() { > awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e > "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ > -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ > ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ > -e 's/\?/ \? 
/g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e > 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \ > -e 's/«/ /g' | tr 0-9 " " > } > wget > http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz > gzip -d news.2013.en.shuffled.gz > normalize_text < news.2013.en.shuffled > data.txt > {code} > {code:none} > import org.apache.spark.mllib.feature.Word2Vec > val text = sc.textFile("dataPath").map { t => t.split(" ").toIterable } > val word2Vec = new Word2Vec() > word2Vec. > setVectorSize(100). > setSeed(42L). > setNumIterations(5). > setNumPartitions(36). > setMinCount(5) > val model = word2Vec.fit(text) > model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / > model.getVectors.size > => > res1: Float = 375059.84 > val word2Vec = new Word2Vec() > word2Vec. > setVectorSize(100). > setSeed(42L). > setNumIterations(5). > setNumPartitions(36). > setMinCount(100) > val model = word2Vec.fit(text) > model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / > model.getVectors.size > => > res3: Float = 1661285.2 > val word2Vec = new Word2Vec() > word2Vec. > setVectorSize(100). > setSeed(42L). > setNumIterations(5). > setNumPartitions(1) > val model = word2Vec.fit(text) > model.getVectors.map { t => t._2.map(_.abs).sum }.sum / 100 / > model.getVectors.size > => > 0.13889 > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org