Hi all, I'm still trying to create a very simple app to learn how to use PredictionIO, and I'm still having trouble. Running pio build works with no problems, but when I run pio train I get a very long error message related to serialisation (the error message is copied below).
pio status reports system is all ready to go. The app I'm trying to build is very simple, it only has 'view' events. Here's the engine.json: *===========================================================* { "comment":" This config file uses default settings for all but the required values see README.md for docs", "id": "default", "description": "Default settings", "engineFactory": "com.actionml.RecommendationEngine", "datasource": { "params" : { "name": "tiny_app_data.csv", "appName": "TinyApp", "eventNames": ["view"] } }, "algorithms": [ { "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames", "name": "ur", "params": { "appName": "TinyApp", "indexName": "urindex", "typeName": "items", "comment": "must have data for the first event or the model will not build, other events are optional", "eventNames": ["view"] } } ] } *===========================================================* The data I'm using is: "u1","i1" "u2","i1" "u2","i2" "u3","i2" "u3","i3" "u4","i4" meaning user u viewed item i. The data has been added to the database with the following python code: *===========================================================* """ Import sample data for recommendation engine """ import predictionio import argparse import random RATE_ACTIONS_DELIMITER = "," SEED = 1 def import_events(client, file): f = open(file, 'r') random.seed(SEED) count = 0 print "Importing data..." 
items = [] users = [] f = open(file, 'r') for line in f: data = line.rstrip('\r\n').split(RATE_ACTIONS_DELIMITER) users.append(data[0]) items.append(data[1]) client.create_event( event="view", entity_type="user", entity_id=data[0], target_entity_type="item", target_entity_id=data[1] ) print "Event: " + "view" + " entity_id: " + data[0] + " target_entity_id: " + data[1] count += 1 f.close() users = set(users) items = set(items) print "All users: " + str(users) print "All items: " + str(items) for item in items: client.create_event( event="$set", entity_type="item", entity_id=item ) count += 1 print "%s events are imported." % count if __name__ == '__main__': parser = argparse.ArgumentParser( description="Import sample data for recommendation engine") parser.add_argument('--access_key', default='invald_access_key') parser.add_argument('--url', default="http://localhost:7070") parser.add_argument('--file', default="./data/tiny_app_data.csv") args = parser.parse_args() print args client = predictionio.EventClient( access_key=args.access_key, url=args.url, threads=5, qsize=500) import_events(client, args.file) *===========================================================* My pio_env.sh is the following: *===========================================================* #!/usr/bin/env bash # # Copy this file as pio-env.sh and edit it for your site's configuration. # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # PredictionIO Main Configuration # # This section controls core behavior of PredictionIO. It is very likely that # you need to change these to fit your site. # SPARK_HOME: Apache Spark is a hard dependency and must be configured. # SPARK_HOME=$PIO_HOME/vendors/spark-2.0.2-bin-hadoop2.7 SPARK_HOME=$PIO_HOME/vendors/spark-1.6.3-bin-hadoop2.6 POSTGRES_JDBC_DRIVER=$PIO_HOME/lib/postgresql-42.1.4.jar MYSQL_JDBC_DRIVER=$PIO_HOME/lib/mysql-connector-java-5.1.41.jar # ES_CONF_DIR: You must configure this if you have advanced configuration for # your Elasticsearch setup. # ES_CONF_DIR=/opt/elasticsearch #ES_CONF_DIR=$PIO_HOME/vendors/elasticsearch-1.7.6 # HADOOP_CONF_DIR: You must configure this if you intend to run PredictionIO # with Hadoop 2. # HADOOP_CONF_DIR=/opt/hadoop # HBASE_CONF_DIR: You must configure this if you intend to run PredictionIO # with HBase on a remote cluster. # HBASE_CONF_DIR=$PIO_HOME/vendors/hbase-1.0.0/conf # Filesystem paths where PredictionIO uses as block storage. PIO_FS_BASEDIR=$HOME/.pio_store PIO_FS_ENGINESDIR=$PIO_FS_BASEDIR/engines PIO_FS_TMPDIR=$PIO_FS_BASEDIR/tmp # PredictionIO Storage Configuration # # This section controls programs that make use of PredictionIO's built-in # storage facilities. Default values are shown below. 
# # For more information on storage configuration please refer to # http://predictionio.incubator.apache.org/system/anotherdatastore/ # Storage Repositories # Default is to use PostgreSQL PIO_STORAGE_REPOSITORIES_METADATA_NAME=pio_meta PIO_STORAGE_REPOSITORIES_METADATA_SOURCE=ELASTICSEARCH PIO_STORAGE_REPOSITORIES_EVENTDATA_NAME=pio_event PIO_STORAGE_REPOSITORIES_EVENTDATA_SOURCE=HBASE PIO_STORAGE_REPOSITORIES_MODELDATA_NAME=pio_model PIO_STORAGE_REPOSITORIES_MODELDATA_SOURCE=LOCALFS # Storage Data Sources # PostgreSQL Default Settings # Please change "pio" to your database name in PIO_STORAGE_SOURCES_PGSQL_URL # Please change PIO_STORAGE_SOURCES_PGSQL_USERNAME and # PIO_STORAGE_SOURCES_PGSQL_PASSWORD accordingly PIO_STORAGE_SOURCES_PGSQL_TYPE=jdbc PIO_STORAGE_SOURCES_PGSQL_URL=jdbc:postgresql://localhost/pio PIO_STORAGE_SOURCES_PGSQL_USERNAME=pio PIO_STORAGE_SOURCES_PGSQL_PASSWORD=pio # MySQL Example # PIO_STORAGE_SOURCES_MYSQL_TYPE=jdbc # PIO_STORAGE_SOURCES_MYSQL_URL=jdbc:mysql://localhost/pio # PIO_STORAGE_SOURCES_MYSQL_USERNAME=pio # PIO_STORAGE_SOURCES_MYSQL_PASSWORD=pio # Elasticsearch Example # PIO_STORAGE_SOURCES_ELASTICSEARCH_TYPE=elasticsearch # PIO_STORAGE_SOURCES_ELASTICSEARCH_HOSTS=localhost # PIO_STORAGE_SOURCES_ELASTICSEARCH_PORTS=9200 # PIO_STORAGE_SOURCES_ELASTICSEARCH_SCHEMES=http # PIO_STORAGE_SOURCES_ELASTICSEARCH_HOME=$PIO_HOME/vendors/elasticsearch-5.2.1 # Elasticsearch 1.x Example PIO_STORAGE_SOURCES_ELASTICSEARCH_TYPE=elasticsearch PIO_STORAGE_SOURCES_ELASTICSEARCH_CLUSTERNAME=myprojectES PIO_STORAGE_SOURCES_ELASTICSEARCH_HOSTS=localhost PIO_STORAGE_SOURCES_ELASTICSEARCH_PORTS=9300 PIO_STORAGE_SOURCES_ELASTICSEARCH_HOME=$PIO_HOME/vendors/elasticsearch-1.7.6 # Local File System Example PIO_STORAGE_SOURCES_LOCALFS_TYPE=localfs PIO_STORAGE_SOURCES_LOCALFS_PATH=$PIO_FS_BASEDIR/models # HBase Example PIO_STORAGE_SOURCES_HBASE_TYPE=hbase PIO_STORAGE_SOURCES_HBASE_HOME=$PIO_HOME/vendors/hbase-1.2.6 
*===========================================================Error message:* *===========================================================* [ERROR] [TaskSetManager] Task 2.0 in stage 10.0 (TID 24) had a not serializable result: org.apache.mahout.math.RandomAccessSparseVector Serialization stack: - object not serializable (class: org.apache.mahout.math.RandomAccessSparseVector, value: {3:1.0,2:1.0}) - field (class: scala.Tuple2, name: _2, type: class java.lang.Object) - object (class scala.Tuple2, (2,{3:1.0,2:1.0})); not retrying [ERROR] [TaskSetManager] Task 3.0 in stage 10.0 (TID 25) had a not serializable result: org.apache.mahout.math.RandomAccessSparseVector Serialization stack: - object not serializable (class: org.apache.mahout.math.RandomAccessSparseVector, value: {0:1.0,3:1.0}) - field (class: scala.Tuple2, name: _2, type: class java.lang.Object) - object (class scala.Tuple2, (3,{0:1.0,3:1.0})); not retrying [ERROR] [TaskSetManager] Task 1.0 in stage 10.0 (TID 23) had a not serializable result: org.apache.mahout.math.RandomAccessSparseVector Serialization stack: - object not serializable (class: org.apache.mahout.math.RandomAccessSparseVector, value: {1:1.0}) - field (class: scala.Tuple2, name: _2, type: class java.lang.Object) - object (class scala.Tuple2, (1,{1:1.0})); not retrying [ERROR] [TaskSetManager] Task 0.0 in stage 10.0 (TID 22) had a not serializable result: org.apache.mahout.math.RandomAccessSparseVector Serialization stack: - object not serializable (class: org.apache.mahout.math.RandomAccessSparseVector, value: {0:1.0}) - field (class: scala.Tuple2, name: _2, type: class java.lang.Object) - object (class scala.Tuple2, (0,{0:1.0})); not retrying Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 2.0 in stage 10.0 (TID 24) had a not serializable result: org.apache.mahout.math.RandomAccessSparseVector Serialization stack: - object not serializable (class: 
org.apache.mahout.math.RandomAccessSparseVector, value: {3:1.0,2:1.0}) - field (class: scala.Tuple2, name: _2, type: class java.lang.Object) - object (class scala.Tuple2, (2,{3:1.0,2:1.0})) at org.apache.spark.scheduler.DAGScheduler.org $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832) at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952) at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1088) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111) at org.apache.spark.rdd.RDD.withScope(RDD.scala:316) at org.apache.spark.rdd.RDD.fold(RDD.scala:1082) at 
org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark.computeNRow(CheckpointedDrmSpark.scala:188) at org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark.nrow$lzycompute(CheckpointedDrmSpark.scala:55) at org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark.nrow(CheckpointedDrmSpark.scala:55) at org.apache.mahout.sparkbindings.drm.CheckpointedDrmSpark.newRowCardinality(CheckpointedDrmSpark.scala:219) at com.actionml.IndexedDatasetSpark$.apply(Preparator.scala:213) at com.actionml.Preparator$$anonfun$3.apply(Preparator.scala:71) at com.actionml.Preparator$$anonfun$3.apply(Preparator.scala:49) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) at scala.collection.immutable.List.foreach(List.scala:318) at scala.collection.TraversableLike$class.map(TraversableLike.scala:244) at scala.collection.AbstractTraversable.map(Traversable.scala:105) at com.actionml.Preparator.prepare(Preparator.scala:49) at com.actionml.Preparator.prepare(Preparator.scala:32) at org.apache.predictionio.controller.PPreparator.prepareBase(PPreparator.scala:37) at org.apache.predictionio.controller.Engine$.train(Engine.scala:671) at org.apache.predictionio.controller.Engine.train(Engine.scala:177) at org.apache.predictionio.workflow.CoreWorkflow$.runTrain(CoreWorkflow.scala:67) at org.apache.predictionio.workflow.CreateWorkflow$.main(CreateWorkflow.scala:250) at org.apache.predictionio.workflow.CreateWorkflow.main(CreateWorkflow.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) at 
org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) *===========================================================* Thank you all for your help. Best regards, noelia