Hello:
I need some help with my PySpark configuration, shown below. I can't tell whether
it's a config issue or whether I am missing jar files. Could someone please help
me get a working PySpark config?
I am using a standalone Spark install on my server with JupyterLab. My goal is
to create an Iceberg table with PySpark and then insert data into it. Once the
config issue is resolved and the Iceberg table is created, I can take it from
there.
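For context, once the table exists, the follow-up step would be roughly this insert (a sketch with made-up sample values, just to show the intent):

spark2.sql("""
    INSERT INTO jay_catalog.db.patient_ice
    VALUES (1, 'F', TIMESTAMP '1985-03-14 00:00:00', 'Asian')
""")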
Again, I would greatly appreciate any help you can give.
JupyterLab code:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkConf, SparkContext

# Set up the configuration
sparkConf = pyspark.SparkConf()
# Iceberg SQL extensions (the class name is case-sensitive:
# "IcebergSparkSessionExtensions").
sparkConf.set("spark.sql.extensions",
              "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
# Register an Iceberg session catalog named jay_catalog, backed by Hive.
sparkConf.set("spark.sql.catalog.jay_catalog",
              "org.apache.iceberg.spark.SparkSessionCatalog")
# The conf key must not contain a trailing space.
sparkConf.set("spark.sql.defaultCatalog", "jay_catalog")
sparkConf.set("spark.sql.catalog.jay_catalog.type", "hive")
# Comma-separated jar lists; no whitespace between entries.
sparkConf.set("spark.jars",
              "/opt/spark/jars/hive-metastore-2.3.9.jar,"
              "/opt/spark/jars/spark-hive-thriftserver_2.12-3.2.0.jar,"
              "/opt/spark/jars/spark-hive_2.12-3.2.0.jar")
sparkConf.set("spark.sql.hive.metastore.jars",
              "/opt/spark/jars/hive-metastore-2.3.9.jar,"
              "/opt/spark/jars/spark-hive-thriftserver_2.12-3.2.0.jar,"
              "/opt/spark/jars/spark-hive_2.12-3.2.0.jar")
sparkConf.set("spark.jars.packages",
              "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0")
# Arrow-based transfers between Spark and pandas (in Spark 3.x the key is
# "spark.sql.execution.arrow.pyspark.enabled").
sparkConf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Hive metastore connection for the catalog and for Hive support.
sparkConf.set("spark.sql.catalog.jay_catalog.uri", "thrift://localhost:9083")
sparkConf.set("hive.metastore.uris", "thrift://localhost:9083")
sparkConf.set("spark.sql.catalog.jay_catalog.warehouse",
              "/opt/spark/pysparkjay/spark-warehouse")

spark2 = (SparkSession.builder
          .appName("Iceberg App")
          .master("local[12]")
          .config(conf=sparkConf)
          .enableHiveSupport()
          .getOrCreate())
print("Spark2 Running")

# Create the table in Spark SQL
spark2.sql("""
    CREATE TABLE IF NOT EXISTS jay_catalog.db.patient_ice (
        P_id     bigint,
        P_gender string,
        P_DOB    timestamp,
        P_race   string
    )
    USING iceberg
    PARTITIONED BY (months(P_DOB))
""")
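As an aside, my understanding is that Iceberg also supports a filesystem-backed "hadoop" catalog that does not need a Hive metastore at all. If the metastore keeps refusing connections, a minimal sketch of that variant would look like this (the catalog name "local" and the warehouse path are placeholders I made up):

sparkConf.set("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
sparkConf.set("spark.sql.catalog.local.type", "hadoop")
sparkConf.set("spark.sql.catalog.local.warehouse",
              "/opt/spark/pysparkjay/iceberg-warehouse")

With that, the table in the CREATE TABLE statement would be named local.db.patient_ice instead of jay_catalog.db.patient_ice.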
With the Hive-catalog config above, I get the errors shown below.
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-23-46552fb57bf5> in <module>
      1 # Creating Table in SpqrkSQL
----> 2 spark2.sql( \
      3 """CREATE TABLE IF NOT EXISTS jay_catalog.db.patient_ice \
      4 ( \
      5 P_id bigint, \

/opt/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)
    721         [Row(f1=1, f2='row1'), Row(f1=2, f2='row2'), Row(f1=3, f2='row3')]
    722         """
--> 723         return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
    724
    725     def table(self, tableName):

~/.local/lib/python3.8/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1302
   1303         answer = self.gateway_client.send_command(command)
-> 1304         return_value = get_return_value(
   1305             answer, self.gateway_client, self.target_id, self.name)
   1306

/opt/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
    109     def deco(*a, **kw):
    110         try:
--> 111             return f(*a, **kw)
    112         except py4j.protocol.Py4JJavaError as e:
    113             converted = convert_exception(e.java_exception)

~/.local/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o43.sql.
: org.apache.iceberg.hive.RuntimeMetaException: Failed to connect to Hive Metastore
    at org.apache.iceberg.hive.HiveClientPool.newClient(HiveClientPool.java:84)
    at org.apache.iceberg.hive.HiveClientPool.newClient(HiveClientPool.java:34)
    at org.apache.iceberg.ClientPoolImpl.get(ClientPoolImpl.java:125)
    at org.apache.iceberg.ClientPoolImpl.run(ClientPoolImpl.java:56)
    at org.apache.iceberg.ClientPoolImpl.run(ClientPoolImpl.java:51)
    at org.apache.iceberg.hive.CachedClientPool.run(CachedClientPool.java:82)
    at org.apache.iceberg.hive.HiveTableOperations.doRefresh(HiveTableOperations.java:223)
    at org.apache.iceberg.BaseMetastoreTableOperations.refresh(BaseMetastoreTableOperations.java:97)
    at org.apache.iceberg.BaseMetastoreTableOperations.current(BaseMetastoreTableOperations.java:80)
    at org.apache.iceberg.BaseMetastoreCatalog.loadTable(BaseMetastoreCatalog.java:44)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
    at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1908)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
    at org.apache.iceberg.CachingCatalog.loadTable(CachingCatalog.java:166)
    at org.apache.iceberg.spark.SparkCatalog.load(SparkCatalog.java:608)
    at org.apache.iceberg.spark.SparkCatalog.loadTable(SparkCatalog.java:145)
    at org.apache.iceberg.spark.SparkSessionCatalog.loadTable(SparkSessionCatalog.java:134)
    at org.apache.spark.sql.connector.catalog.TableCatalog.tableExists(TableCatalog.java:119)
    at org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:40)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
    at org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
    at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:618)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.metastore.HiveMetaStoreClient
    at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1742)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:83)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:133)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:97)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at org.apache.iceberg.common.DynMethods$UnboundMethod.invokeChecked(DynMethods.java:60)
    at org.apache.iceberg.common.DynMethods$UnboundMethod.invoke(DynMethods.java:72)
    at org.apache.iceberg.common.DynMethods$StaticMethod.invoke(DynMethods.java:185)
    at org.apache.iceberg.hive.HiveClientPool.newClient(HiveClientPool.java:63)
    ... 63 more
Caused by: java.lang.reflect.InvocationTargetException
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
    at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1740)
    ... 75 more
Caused by: MetaException(message:Could not connect to meta store using any of the URIs provided. Most recent failure: org.apache.thrift.transport.TTransportException: java.net.ConnectException: Connection refused (Connection refused)
    at org.apache.thrift.transport.TSocket.open(TSocket.java:226)
    at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.open(HiveMetaStoreClient.java:478)
    at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:245)
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
    at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1740)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:83)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:133)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
    at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:97)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at org.apache.iceberg.common.DynMethods$UnboundMethod.invokeChecked(DynMethods.java:60)
    at org.apache.iceberg.common.DynMethods$UnboundMethod.invoke(DynMethods.java:72)
    at org.apache.iceberg.common.DynMethods$StaticMethod.invoke(DynMethods.java:185)
    at org.apache.iceberg.hive.HiveClientPool.newClient(HiveClientPool.java:63)
    at org.apache.iceberg.hive.HiveClientPool.newClient(HiveClientPool.java:34)
    at org.apache.iceberg.ClientPoolImpl.get(ClientPoolImpl.java:125)
    at org.apache.iceberg.ClientPoolImpl.run(ClientPoolImpl.java:56)
    at org.apache.iceberg.ClientPoolImpl.run(ClientPoolImpl.java:51)
    at org.apache.iceberg.hive.CachedClientPool.run(CachedClientPool.java:82)
    at org.apache.iceberg.hive.HiveTableOperations.doRefresh(HiveTableOperations.java:223)
    at org.apache.iceberg.BaseMetastoreTableOperations.refresh(BaseMetastoreTableOperations.java:97)
    at org.apache.iceberg.BaseMetastoreTableOperations.current(BaseMetastoreTableOperations.java:80)
    at org.apache.iceberg.BaseMetastoreCatalog.loadTable(BaseMetastoreCatalog.java:44)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.lambda$doComputeIfAbsent$14(BoundedLocalCache.java:2406)
    at java.base/java.util.concurrent.ConcurrentHashMap.compute(ConcurrentHashMap.java:1908)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.doComputeIfAbsent(BoundedLocalCache.java:2404)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.BoundedLocalCache.computeIfAbsent(BoundedLocalCache.java:2387)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalCache.computeIfAbsent(LocalCache.java:108)
    at org.apache.iceberg.shaded.com.github.benmanes.caffeine.cache.LocalManualCache.get(LocalManualCache.java:62)
    at org.apache.iceberg.CachingCatalog.loadTable(CachingCatalog.java:166)
    at org.apache.iceberg.spark.SparkCatalog.load(SparkCatalog.java:608)
    at org.apache.iceberg.spark.SparkCatalog.loadTable(SparkCatalog.java:145)
    at org.apache.iceberg.spark.SparkSessionCatalog.loadTable(SparkSessionCatalog.java:134)
    at org.apache.spark.sql.connector.catalog.TableCatalog.tableExists(TableCatalog.java:119)
    at org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:40)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
    at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:106)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:106)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:93)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:91)
    at org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
    at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:618)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
    at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
    at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.net.ConnectException: Connection refused (Connection refused)
    at java.base/java.net.PlainSocketImpl.socketConnect(Native Method)
    at java.base/java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:399)
    at java.base/java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:242)
    at java.base/java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:224)
    at java.base/java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
    at java.base/java.net.Socket.connect(Socket.java:609)
    at org.apache.thrift.transport.TSocket.open(TSocket.java:221)
    ... 82 more
)
    at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.open(HiveMetaStoreClient.java:527)
    at org.apache.hadoop.hive.metastore.Hi
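
The innermost cause is "Connection refused" on thrift://localhost:9083, which makes me think nothing is listening on that port. A quick check I can run from the same machine (a minimal sketch, assuming the metastore is supposed to listen on localhost:9083):

import socket

# Try to open a plain TCP connection to the metastore port; an OSError
# (e.g. ConnectionRefusedError) means no service is listening there.
try:
    with socket.create_connection(("localhost", 9083), timeout=5):
        print("Something is listening on localhost:9083")
except OSError as e:
    print(f"Cannot reach localhost:9083: {e}")

If that check fails, I suspect the Hive metastore service itself needs to be started before any of the PySpark config matters.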