[ 
https://issues.apache.org/jira/browse/SPARK-12110?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Patrick Wendell updated SPARK-12110:
------------------------------------
    Description: 
I am using spark-1.5.1-bin-hadoop2.6. I used 
spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2 to create a cluster and configured 
spark-env to use python3. I cannot run the tokenizer sample code. Is there a 
workaround?

Kind regards

Andy

{code}
/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    658             raise Exception("You must build Spark with Hive. "
    659                             "Export 'SPARK_HIVE=true' and run "
--> 660                             "build/sbt assembly", e)
    661 
    662     def _get_hive_ctx(self):

Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
build/sbt assembly", Py4JJavaError('An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))




http://spark.apache.org/docs/latest/ml-features.html#tokenizer

from pyspark.ml.feature import Tokenizer, RegexTokenizer

sentenceDataFrame = sqlContext.createDataFrame([
  (0, "Hi I heard about Spark"),
  (1, "I wish Java could use case classes"),
  (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsDataFrame = tokenizer.transform(sentenceDataFrame)
for words_label in wordsDataFrame.select("words", "label").take(3):
  print(words_label)

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    654             if not hasattr(self, '_scala_HiveContext'):
--> 655                 self._scala_HiveContext = self._get_hive_ctx()
    656             return self._scala_HiveContext

/root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
    662     def _get_hive_ctx(self):
--> 663         return self._jvm.HiveContext(self._jsc.sc())
    664 

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in 
__call__(self, *args)
    700         return_value = get_return_value(answer, self._gateway_client, 
None,
--> 701                 self._fqn)
    702 

/root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     35         try:
---> 36             return f(*a, **kw)
     37         except py4j.protocol.Py4JJavaError as e:

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in 
get_return_value(answer, gateway_client, target_id, name)
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:

Py4JJavaError: An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: java.io.IOException: Filesystem closed
        at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
        at 
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:171)
        at 
org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:162)
        at 
org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:160)
        at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:167)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
        at py4j.Gateway.invoke(Gateway.java:214)
        at 
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
        at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
        at py4j.GatewayConnection.run(GatewayConnection.java:207)
        at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Filesystem closed
        at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:323)
        at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1057)
        at 
org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
        at 
org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:599)
        at 
org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
        at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
        ... 15 more


During handling of the above exception, another exception occurred:

Exception                                 Traceback (most recent call last)
<ipython-input-1-0beb490d573c> in <module>()
      5   (1, "I wish Java could use case classes"),
      6   (2, "Logistic,regression,models,are,neat")
----> 7 ], ["label", "sentence"])
      8 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
      9 wordsDataFrame = tokenizer.transform(sentenceDataFrame)

/root/spark/python/pyspark/sql/context.py in createDataFrame(self, data, 
schema, samplingRatio)
    406             rdd, schema = self._createFromLocal(data, schema)
    407         jrdd = 
self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
--> 408         jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), 
schema.json())
    409         df = DataFrame(jdf, self)
    410         df._schema = schema

/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    658             raise Exception("You must build Spark with Hive. "
    659                             "Export 'SPARK_HIVE=true' and run "
--> 660                             "build/sbt assembly", e)
    661 
    662     def _get_hive_ctx(self):

Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
build/sbt assembly", Py4JJavaError('An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
{code}


  was:
I am using spark-1.5.1-bin-hadoop2.6. I used 
spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2 to create a cluster and configured 
spark-env to use python3. I cannot run the tokenizer sample code. Is there a 
workaround?

Kind regards

Andy

/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    658             raise Exception("You must build Spark with Hive. "
    659                             "Export 'SPARK_HIVE=true' and run "
--> 660                             "build/sbt assembly", e)
    661 
    662     def _get_hive_ctx(self):

Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
build/sbt assembly", Py4JJavaError('An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))




http://spark.apache.org/docs/latest/ml-features.html#tokenizer

from pyspark.ml.feature import Tokenizer, RegexTokenizer

sentenceDataFrame = sqlContext.createDataFrame([
  (0, "Hi I heard about Spark"),
  (1, "I wish Java could use case classes"),
  (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsDataFrame = tokenizer.transform(sentenceDataFrame)
for words_label in wordsDataFrame.select("words", "label").take(3):
  print(words_label)

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    654             if not hasattr(self, '_scala_HiveContext'):
--> 655                 self._scala_HiveContext = self._get_hive_ctx()
    656             return self._scala_HiveContext

/root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
    662     def _get_hive_ctx(self):
--> 663         return self._jvm.HiveContext(self._jsc.sc())
    664 

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in 
__call__(self, *args)
    700         return_value = get_return_value(answer, self._gateway_client, 
None,
--> 701                 self._fqn)
    702 

/root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     35         try:
---> 36             return f(*a, **kw)
     37         except py4j.protocol.Py4JJavaError as e:

/root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in 
get_return_value(answer, gateway_client, target_id, name)
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:

Py4JJavaError: An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException: java.io.IOException: Filesystem closed
        at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
        at 
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:171)
        at 
org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:162)
        at 
org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:160)
        at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:167)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
        at py4j.Gateway.invoke(Gateway.java:214)
        at 
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
        at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
        at py4j.GatewayConnection.run(GatewayConnection.java:207)
        at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Filesystem closed
        at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:323)
        at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1057)
        at 
org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
        at 
org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:599)
        at 
org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
        at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
        ... 15 more


During handling of the above exception, another exception occurred:

Exception                                 Traceback (most recent call last)
<ipython-input-1-0beb490d573c> in <module>()
      5   (1, "I wish Java could use case classes"),
      6   (2, "Logistic,regression,models,are,neat")
----> 7 ], ["label", "sentence"])
      8 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
      9 wordsDataFrame = tokenizer.transform(sentenceDataFrame)

/root/spark/python/pyspark/sql/context.py in createDataFrame(self, data, 
schema, samplingRatio)
    406             rdd, schema = self._createFromLocal(data, schema)
    407         jrdd = 
self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
--> 408         jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), 
schema.json())
    409         df = DataFrame(jdf, self)
    410         df._schema = schema

/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    658             raise Exception("You must build Spark with Hive. "
    659                             "Export 'SPARK_HIVE=true' and run "
--> 660                             "build/sbt assembly", e)
    661 
    662     def _get_hive_ctx(self):

Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
build/sbt assembly", Py4JJavaError('An error occurred while calling 
None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))



> spark-1.5.1-bin-hadoop2.6;  pyspark.ml.feature  Exception: ("You must build 
> Spark with Hive 
> --------------------------------------------------------------------------------------------
>
>                 Key: SPARK-12110
>                 URL: https://issues.apache.org/jira/browse/SPARK-12110
>             Project: Spark
>          Issue Type: Bug
>          Components: EC2
>    Affects Versions: 1.5.1
>         Environment: cluster created using 
> spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2
>            Reporter: Andrew Davidson
>
> I am using spark-1.5.1-bin-hadoop2.6. I used 
> spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2 to create a cluster and configured 
> spark-env to use python3. I cannot run the tokenizer sample code. Is there a 
> workaround?
> Kind regards
> Andy
> {code}
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     658             raise Exception("You must build Spark with Hive. "
>     659                             "Export 'SPARK_HIVE=true' and run "
> --> 660                             "build/sbt assembly", e)
>     661 
>     662     def _get_hive_ctx(self):
> Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
> build/sbt assembly", Py4JJavaError('An error occurred while calling 
> None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
> http://spark.apache.org/docs/latest/ml-features.html#tokenizer
> from pyspark.ml.feature import Tokenizer, RegexTokenizer
> sentenceDataFrame = sqlContext.createDataFrame([
>   (0, "Hi I heard about Spark"),
>   (1, "I wish Java could use case classes"),
>   (2, "Logistic,regression,models,are,neat")
> ], ["label", "sentence"])
> tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
> wordsDataFrame = tokenizer.transform(sentenceDataFrame)
> for words_label in wordsDataFrame.select("words", "label").take(3):
>   print(words_label)
> ---------------------------------------------------------------------------
> Py4JJavaError                             Traceback (most recent call last)
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     654             if not hasattr(self, '_scala_HiveContext'):
> --> 655                 self._scala_HiveContext = self._get_hive_ctx()
>     656             return self._scala_HiveContext
> /root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
>     662     def _get_hive_ctx(self):
> --> 663         return self._jvm.HiveContext(self._jsc.sc())
>     664 
> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in 
> __call__(self, *args)
>     700         return_value = get_return_value(answer, self._gateway_client, 
> None,
> --> 701                 self._fqn)
>     702 
> /root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
>      35         try:
> ---> 36             return f(*a, **kw)
>      37         except py4j.protocol.Py4JJavaError as e:
> /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in 
> get_return_value(answer, gateway_client, target_id, name)
>     299                     'An error occurred while calling {0}{1}{2}.\n'.
> --> 300                     format(target_id, '.', name), value)
>     301             else:
> Py4JJavaError: An error occurred while calling 
> None.org.apache.spark.sql.hive.HiveContext.
> : java.lang.RuntimeException: java.io.IOException: Filesystem closed
>       at 
> org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
>       at 
> org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:171)
>       at 
> org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:162)
>       at 
> org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:160)
>       at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:167)
>       at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
>       at 
> sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
>       at 
> sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
>       at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
>       at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
>       at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>       at py4j.Gateway.invoke(Gateway.java:214)
>       at 
> py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
>       at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
>       at py4j.GatewayConnection.run(GatewayConnection.java:207)
>       at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.IOException: Filesystem closed
>       at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:323)
>       at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1057)
>       at 
> org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
>       at 
> org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:599)
>       at 
> org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
>       at 
> org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
>       ... 15 more
> During handling of the above exception, another exception occurred:
> Exception                                 Traceback (most recent call last)
> <ipython-input-1-0beb490d573c> in <module>()
>       5   (1, "I wish Java could use case classes"),
>       6   (2, "Logistic,regression,models,are,neat")
> ----> 7 ], ["label", "sentence"])
>       8 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
>       9 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
> /root/spark/python/pyspark/sql/context.py in createDataFrame(self, data, 
> schema, samplingRatio)
>     406             rdd, schema = self._createFromLocal(data, schema)
>     407         jrdd = 
> self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
> --> 408         jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), 
> schema.json())
>     409         df = DataFrame(jdf, self)
>     410         df._schema = schema
> /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
>     658             raise Exception("You must build Spark with Hive. "
>     659                             "Export 'SPARK_HIVE=true' and run "
> --> 660                             "build/sbt assembly", e)
>     661 
>     662     def _get_hive_ctx(self):
> Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run 
> build/sbt assembly", Py4JJavaError('An error occurred while calling 
> None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to