[ https://issues.apache.org/jira/browse/SPARK-20445?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
surya pratap updated SPARK-20445: --------------------------------- Comment: was deleted (was: Hello Hyukjin Kwon, Thanks for the fast reply. Which version and which environment are you using? I am running my code on MapR with Spark version 1.6.) > pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was > given input with invalid label column label, without the number of classes > specified. See StringIndexer > -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- > > Key: SPARK-20445 > URL: https://issues.apache.org/jira/browse/SPARK-20445 > Project: Spark > Issue Type: Bug > Components: MLlib > Affects Versions: 1.6.1 > Reporter: surya pratap > > #Load the CSV file into an RDD > irisData = sc.textFile("/home/infademo/surya/iris.csv") > irisData.cache() > irisData.count() > #Remove the first line (contains headers) > dataLines = irisData.filter(lambda x: "Sepal" not in x) > dataLines.count() > from pyspark.sql import Row > #Create a Data Frame from the data > parts = dataLines.map(lambda l: l.split(",")) > irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\ > SEPAL_WIDTH=float(p[1]), \ > PETAL_LENGTH=float(p[2]), \ > PETAL_WIDTH=float(p[3]), \ > SPECIES=p[4] )) > # Infer the schema, and register the DataFrame as a table. 
> irisDf = sqlContext.createDataFrame(irisMap) > irisDf.cache() > #Add a numeric indexer for the label/target column > from pyspark.ml.feature import StringIndexer > stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES") > si_model = stringIndexer.fit(irisDf) > irisNormDf = si_model.transform(irisDf) > irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect() > irisNormDf.cache() > > """-------------------------------------------------------------------------- > Perform Data Analytics > > -------------------------------------------------------------------------""" > #See standard parameters > irisNormDf.describe().show() > #Find correlation between predictors and target > for i in irisNormDf.columns: > if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) : > print( "Correlation to Species for ", i, \ > irisNormDf.stat.corr('IND_SPECIES',i)) > #Transform to a Data Frame for input to Machine Learning > #Drop columns that are not required (low correlation) > from pyspark.mllib.linalg import Vectors > from pyspark.mllib.linalg import SparseVector > from pyspark.mllib.regression import LabeledPoint > from pyspark.mllib.util import MLUtils > import org.apache.spark.mllib.linalg.{Matrix, Matrices} > from pyspark.mllib.linalg.distributed import RowMatrix > from pyspark.ml.linalg import Vectors > pyspark.mllib.linalg.Vector > def transformToLabeledPoint(row) : > lp = ( row["SPECIES"], row["IND_SPECIES"], \ > Vectors.dense([row["SEPAL_LENGTH"],\ > row["SEPAL_WIDTH"], \ > row["PETAL_LENGTH"], \ > row["PETAL_WIDTH"]])) > return lp > irisLp = irisNormDf.rdd.map(transformToLabeledPoint) > irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", > "features"]) > irisLpDf.select("species","label","features").show(10) > irisLpDf.cache() > > """-------------------------------------------------------------------------- > Perform Machine Learning > > -------------------------------------------------------------------------""" > #Split 
into training and testing data > (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1]) > trainingData.count() > testData.count() > testData.collect() > from pyspark.ml.classification import DecisionTreeClassifier > from pyspark.ml.evaluation import MulticlassClassificationEvaluator > #Create the model > dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\ > featuresCol="features") > dtModel = dtClassifer.fit(trainingData) > > issue part:- > > dtModel = dtClassifer.fit(trainingData) Traceback (most recent call last): > File "", line 1, in File > "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", > line 69, in fit return self._fit(dataset) File > "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", > line 133, in _fit java_model = self._fit_java(dataset) File > "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", > line 130, in _fit_java return self._java_obj.fit(dataset._jdf) File > "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", > line 813, in call File > "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line > 53, in deco raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace) > pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was > given input with invalid label column label, without the number of classes > specified. See StringIndexer.' -- This message was sent by Atlassian JIRA (v6.3.15#6346) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org