[ https://issues.apache.org/jira/browse/SPARK-14153?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15213765#comment-15213765 ]
Dulaj Rajitha edited comment on SPARK-14153 at 3/28/16 3:56 AM:
----------------------------------------------------------------

This is the Java code I used:

package ml.test;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.recommendation.ALS;
import org.apache.spark.ml.recommendation.ALSModel;
import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.CrossValidatorModel;
import org.apache.spark.ml.tuning.ParamGridBuilder;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;

/**
 * @author Dulaj Pathirana - Mar 14, 2016
 */
public class ALSImplicitTest {

    static JavaSparkContext jsc;
    final static String sparkMaster = "spark://192.168.1.71:7077";
    final static String dataPathPrefix = "hdfs://192.168.1.71/res/";
    final static String trainDataFile = dataPathPrefix + "train.csv";
    final static String testDataFile = dataPathPrefix + "test.csv";
    final static String modelPath = dataPathPrefix + "als_implicit1.model";
    final static String sparkLogDir = dataPathPrefix + "logs/";
    static DataFrame test;
    static DataFrame train;

    public static void main( String[] args ) {
        final int folds = 2;
        final int[] ranks = { 120, 500 };
        double alpha = 0.01, regParam = 0.02, tuningInterval = 0.01;
        final double[] alphas = prepareDoubleParams( alpha, tuningInterval, 1 );
        final double[] regParams = prepareDoubleParams( regParam, tuningInterval, 1 );
        prepareDataFrames();

        // Build the recommendation model using ALS on the training data.
        // rank is the number of latent factors in the model.
        // regParam specifies the regularization parameter in ALS.
        // implicitPrefs specifies whether to use the explicit feedback ALS variant
        // or one adapted for implicit feedback data.
        // alpha applies to the implicit feedback variant of ALS and governs the
        // baseline confidence in preference observations.
        ALS implicitALS = new ALS().setImplicitPrefs( true ).setUserCol( "user" ).setItemCol( "item" )
                .setRatingCol( "confidence" ).setPredictionCol( "prediction" );

        ParamMap[] paramMaps = new ParamGridBuilder().addGrid( implicitALS.alpha(), alphas )
                .addGrid( implicitALS.regParam(), regParams ).addGrid( implicitALS.rank(), ranks ).build();

        RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName( "rmse" )
                .setLabelCol( "confidence" ).setPredictionCol( "prediction" );

        CrossValidator crossValidator = new CrossValidator().setEstimator( implicitALS )
                .setEvaluator( evaluator ).setEstimatorParamMaps( paramMaps ).setNumFolds( folds );
        CrossValidatorModel crossValidatorModel = crossValidator.fit( train );

        // Save the best model (MLWriter.save() is the public entry point).
        ALSModel alsModel = ( ALSModel ) crossValidatorModel.bestModel();
        alsModel.write().overwrite().save( modelPath );

        // Load the model back.
        ALSModel bestModel = ALSModel.read().load( modelPath );

        // Predict on a random 20% slice of the training data.
        DataFrame predictDf = bestModel.transform( train.randomSplit( new double[] { 0.8, 0.2 } )[1] );
        DataFrame predictions = predictDf
                .withColumn( "confidence", predictDf.col( "confidence" ).cast( DataTypes.DoubleType ) );
        predictions.show();
        System.out.println( "Root-mean-square error = " + evaluator.evaluate( predictions ) );
        jsc.stop();
    }

    private static void prepareDataFrames() {
        final SparkConf conf = new SparkConf().setAppName( "ALS-Implicit with cross validation Model" )
                .setMaster( sparkMaster ).set( "spark.executor.memory", "4g" )
                .set( "spark.eventLog.dir", sparkLogDir ).set( "spark.eventLog.enabled", "false" );
        jsc = new JavaSparkContext( conf );
        jsc.addJar( dataPathPrefix + "spark-csv_2.10-1.3.0.jar" );
        jsc.addJar( dataPathPrefix + "commons-csv-1.2.jar" );
        final SQLContext sqlContext = new SQLContext( jsc );

        DataFrame tst = sqlContext.read().format( "com.databricks.spark.csv" )
                .option( "inferSchema", "true" ).option( "header", "true" ).load( testDataFile );
        test = tst.withColumn( "confidence", tst.col( "confidence" ).cast( DataTypes.DoubleType ) ).cache();

        DataFrame trn = sqlContext.read().format( "com.databricks.spark.csv" )
                .option( "inferSchema", "true" ).option( "header", "true" ).load( trainDataFile );
        train = trn.withColumn( "confidence", trn.col( "confidence" ).cast( DataTypes.DoubleType ) ).cache();
    }

    // Returns { start, start + interval, ..., start + steps * interval }.
    private static double[] prepareDoubleParams( double start, double interval, int steps ) {
        double[] res = new double[steps + 1];
        res[0] = start;
        for ( int i = 1; i <= steps; i++ ) {
            res[i] = ( start += interval );
        }
        return res;
    }
}
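One likely cause of the NaN cross-validation metrics quoted below: in Spark 1.6, ALSModel.transform() returns Double.NaN as the prediction for any user or item that was not present in the fold the model was fit on, and a single NaN prediction is enough to turn the fold's RMSE (and hence the averaged metric) into NaN. A minimal workaround sketch, reusing the bestModel, evaluator, and test variables from the code above, that drops such rows before evaluating:

    DataFrame scored = bestModel.transform( test );
    // DataFrameNaFunctions.drop() removes rows whose listed columns are null or NaN,
    // so cold-start users/items no longer poison the RMSE.
    DataFrame scoredKnown = scored.na().drop( new String[] { "prediction" } );
    System.out.println( "RMSE excluding cold-start rows = " + evaluator.evaluate( scoredKnown ) );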
> My dataset does not provide proper predictions in ALS
> -----------------------------------------------------
>
>                 Key: SPARK-14153
>                 URL: https://issues.apache.org/jira/browse/SPARK-14153
>             Project: Spark
>          Issue Type: Question
>          Components: Java API, ML
>            Reporter: Dulaj Rajitha
>
> When I used the data set from the GitHub example, I get proper predictions.
> But when I used my own data set, it does not predict well (it has a large
> RMSE). I used a cross validator for ALS (in Spark ML) and here are the best
> model parameters:
> 16/03/25 12:03:06 INFO CrossValidator: Average cross-validation metrics:
> WrappedArray(NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN)
> 16/03/25 12:03:06 INFO CrossValidator: Best set of parameters:
> {
>     als_c911c0e183a3-alpha: 0.02,
>     als_c911c0e183a3-rank: 500,
>     als_c911c0e183a3-regParam: 0.03
> }
> But when I used the movie data set, it gives proper values for the parameters:
> 16/03/24 14:07:07 INFO CrossValidator: Average cross-validation metrics:
> WrappedArray(1.9481584447713676, 2.0501457159728944, 2.0600857505406935,
> 1.9457234533860048, 2.0494498583414282, 2.0595306613827002,
> 1.9488322049918922, 2.0489573853226797, 2.0584252131752, 1.9464006741621391,
> 2.048241271354197, 2.057853990227443)
> 16/03/24 14:07:07 INFO CrossValidator: Best set of parameters:
> {
>     als_31a605e7717b-alpha: 0.02,
>     als_31a605e7717b-rank: 1,
>     als_31a605e7717b-regParam: 0.02
> }
> 16/03/24 14:07:07 INFO CrossValidator: Best cross-validation metric: 1.9457234533860048.
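For what it's worth, the eight NaN entries in the WrappedArray above line up with the eight parameter combinations in the grid (2 alphas × 2 regParams × 2 ranks), which suggests every fold hit at least one NaN prediction rather than one bad parameter setting. Since CrossValidator splits the training data into folds at random, rare users or items can easily land only in a validation fold. A quick check sketch, assuming the train and test DataFrames from the code above with "user" and "item" columns:

    // Distinct users/items present in test but absent from train; any non-zero
    // count means ALS has to predict for IDs it has never seen.
    DataFrame coldUsers = test.select( "user" ).distinct()
            .except( train.select( "user" ).distinct() );
    DataFrame coldItems = test.select( "item" ).distinct()
            .except( train.select( "item" ).distinct() );
    System.out.println( "unseen users in test: " + coldUsers.count() );
    System.out.println( "unseen items in test: " + coldItems.count() );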