[ 
https://issues.apache.org/jira/browse/SPARK-20445?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

surya pratap updated SPARK-20445:
---------------------------------
    Comment: was deleted

(was: Hello Hyukjin Kwon,
Thanks for the fast reply.
Which version and which environment are you using?
I am running my code on MapR with Spark version 1.6.)

> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was 
> given input with invalid label column label, without the number of classes 
> specified. See StringIndexer
> --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: SPARK-20445
>                 URL: https://issues.apache.org/jira/browse/SPARK-20445
>             Project: Spark
>          Issue Type: Bug
>          Components: MLlib
>    Affects Versions: 1.6.1
>            Reporter: surya pratap
>
>  #Load the CSV file into a RDD
>     irisData = sc.textFile("/home/infademo/surya/iris.csv")
>     irisData.cache()
>     irisData.count()
>     #Remove the first line (contains headers)
>     dataLines = irisData.filter(lambda x: "Sepal" not in x)
>     dataLines.count()
>     from pyspark.sql import Row
>     #Create a Data Frame from the data
>     parts = dataLines.map(lambda l: l.split(","))
>     irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
>                                     SEPAL_WIDTH=float(p[1]), \
>                                     PETAL_LENGTH=float(p[2]), \
>                                     PETAL_WIDTH=float(p[3]), \
>                                     SPECIES=p[4] ))
>     # Infer the schema, and register the DataFrame as a table.
>     irisDf = sqlContext.createDataFrame(irisMap)
>     irisDf.cache()
>     #Add a numeric indexer for the label/target column
>     from pyspark.ml.feature import StringIndexer
>     stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
>     si_model = stringIndexer.fit(irisDf)
>     irisNormDf = si_model.transform(irisDf)
>     irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
>     irisNormDf.cache()
>     
> """--------------------------------------------------------------------------
>     Perform Data Analytics
>     
> -------------------------------------------------------------------------"""
>     #See standard parameters
>     irisNormDf.describe().show()
>     #Find correlation between predictors and target
>     for i in irisNormDf.columns:
>         if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
>             print( "Correlation to Species for ", i, \
>                         irisNormDf.stat.corr('IND_SPECIES',i))
>     #Transform to a Data Frame for input to Machine Learning
>     #Drop columns that are not required (low correlation)
>     from pyspark.mllib.linalg import Vectors
>     from pyspark.mllib.linalg import SparseVector
>     from pyspark.mllib.regression import LabeledPoint
>     from pyspark.mllib.util import MLUtils
>     import org.apache.spark.mllib.linalg.{Matrix, Matrices}
>     from pyspark.mllib.linalg.distributed import RowMatrix
>     from pyspark.ml.linalg import Vectors
>     pyspark.mllib.linalg.Vector
>     def transformToLabeledPoint(row) :
>         lp = ( row["SPECIES"], row["IND_SPECIES"], \
>                     Vectors.dense([row["SEPAL_LENGTH"],\
>                             row["SEPAL_WIDTH"], \
>                             row["PETAL_LENGTH"], \
>                             row["PETAL_WIDTH"]]))
>         return lp
>     irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
>     irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", 
> "features"])
>     irisLpDf.select("species","label","features").show(10)
>     irisLpDf.cache()
>     
> """--------------------------------------------------------------------------
>     Perform Machine Learning
>     
> -------------------------------------------------------------------------"""
>     #Split into training and testing data
>     (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
>     trainingData.count()
>     testData.count()
>     testData.collect()
>     from pyspark.ml.classification import DecisionTreeClassifier
>     from pyspark.ml.evaluation import MulticlassClassificationEvaluator
>     #Create the model
>     dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
>                     featuresCol="features")
>    dtModel = dtClassifer.fit(trainingData)
>    
>    Issue part (the call that fails, with its traceback):
>    
>    dtModel = dtClassifer.fit(trainingData) Traceback (most recent call last): 
> File "", line 1, in File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", 
> line 69, in fit return self._fit(dataset) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", 
> line 133, in _fit java_model = self._fit_java(dataset) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", 
> line 130, in _fit_java return self._java_obj.fit(dataset._jdf) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
>  line 813, in call File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line 
> 53, in deco raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace) 
> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was 
> given input with invalid label column label, without the number of classes 
> specified. See StringIndexer.'



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to