[ https://issues.apache.org/jira/browse/HUDI-1568?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17288120#comment-17288120 ]
sivabalan narayanan edited comment on HUDI-1568 at 2/22/21, 1:44 AM: --------------------------------------------------------------------- Reported by customer as well: https://github.com/apache/hudi/issues/2566 was (Author: shivnarayan): Tried pyspark as per quick start utils for MERGE_ON_READ table. COW has no issues. Running into NoSuchMethodError for org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init> ``` >>> df.write.format("hudi"). \ options(**hudi_options). \ mode("overwrite"). \ save(basePath) 21/02/21 20:14:40 WARN HoodieSparkSqlWriter$: hoodie table at file:/tmp/hudi_trips_cow already exists. Deleting existing data & overwriting with new data. >>> >>> >>> tripsSnapshotDF = spark. \ read. \ format("hudi"). \ load(basePath + "/*/*/*/*") Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/pyspark/sql/readwriter.py", line 178, in load return self._df(self._jreader.load(path)) File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__ File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/pyspark/sql/utils.py", line 128, in deco return f(*a, **kw) File "/usr/local/Cellar/apache-spark/3.0.1/libexec/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value py4j.protocol.Py4JJavaError: An error occurred while calling o64.load. 
: java.lang.NoSuchMethodError: 'void org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(org.apache.spark.sql.SparkSession, scala.collection.Seq, scala.collection.immutable.Map, scala.Option, org.apache.spark.sql.execution.datasources.FileStatusCache)' at org.apache.hudi.HoodieSparkUtils$.createInMemoryFileIndex(HoodieSparkUtils.scala:89) at org.apache.hudi.MergeOnReadSnapshotRelation.buildFileIndex(MergeOnReadSnapshotRelation.scala:127) at org.apache.hudi.MergeOnReadSnapshotRelation.<init>(MergeOnReadSnapshotRelation.scala:72) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:89) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:53) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:344) at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297) at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:232) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.base/java.lang.Thread.run(Thread.java:834) ``` Tried the quick start utils for pyspark as is. 
These are the paths I had set ``` alias python='python3' alias python=/usr/local/bin/python3.9 alias pip=/usr/local/bin/pip3 export JAVA_HOME=/Library/java/JavaVirtualMachines/jdk1.8.0_192.jdk/Contents/Home/ export JRE_HOME=/Library/java/JavaVirtualMachines/jdk1.8.0_192.jdk/Contents/Home/jre/ export SPARK_HOME=/usr/local/Cellar/apache-spark/3.0.1/libexec/ export PATH=/usr/local/Cellar/apache-spark/3.0.1/bin:$PATH ``` > Issues w/ spark_bundle_2.12 : NoSuchMethodError: 'void > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init> > -------------------------------------------------------------------------------------------------------------------------- > > Key: HUDI-1568 > URL: https://issues.apache.org/jira/browse/HUDI-1568 > Project: Apache Hudi > Issue Type: Bug > Components: Spark Integration > Reporter: sivabalan narayanan > Priority: Major > Labels: sev:critical, user-support-issues > > I tried Quick Start with hudi-spark-bundle_2.12 and it fails w/ > NoSuchMethodError: 'void > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init> during > read. > This is happening only for MOR and not for COW. > Command used to invoke spark shell > > spark-3.0.1-bin-hadoop2.7/bin/spark-shell \ > --packages > org.apache.hudi:hudi-spark-bundle_2.12:0.7.0,org.apache.spark:spark-avro_2.12:3.0.1 > \ > --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' > my local spark is spark-3.0.1. > {code:java} > . // usual steps as given in quick start utils. > . > . > scala> df.write.format("hudi"). > | options(getQuickstartWriteConfigs). > | option(PRECOMBINE_FIELD_OPT_KEY, "ts"). > | option(RECORDKEY_FIELD_OPT_KEY, "uuid"). > | option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath"). > | option(TABLE_NAME, tableName). > | option("hoodie.datasource.write.table.type","MERGE_ON_READ") > | mode(Overwrite). > | save(basePath) > val tripsSnapshotDF = spark. > read. > format("hudi"). 
> load(basePath + "/*/*/*/*") > java.lang.NoSuchMethodError: 'void > org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(org.apache.spark.sql.SparkSession, > scala.collection.Seq, scala.collection.immutable.Map, scala.Option, > org.apache.spark.sql.execution.datasources.FileStatusCache)' > at > org.apache.hudi.HoodieSparkUtils$.createInMemoryFileIndex(HoodieSparkUtils.scala:89) > at > org.apache.hudi.MergeOnReadSnapshotRelation.buildFileIndex(MergeOnReadSnapshotRelation.scala:127) > at > org.apache.hudi.MergeOnReadSnapshotRelation.<init>(MergeOnReadSnapshotRelation.scala:72) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:89) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:53) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:344) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:286) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:232) > ... 62 elided > {code} > > > -- This message was sent by Atlassian Jira (v8.3.4#803005)