soumilshah1995 commented on issue #353: URL: https://github.com/apache/incubator-xtable/issues/353#issuecomment-1983449032
Hey @the-other-tim-brown I started some work with Glue and OneTable on Delta Streamer. Here is what I am doing, and I know it is 99% a jar issue. # Step 1: Upload the jars to S3 ``` s3://XX/jar/hudi-spark3.3-bundle_2.12-0.14.0.jar s3://XX/jar/hudi-utilities-slim-bundle_2.12-0.14.0.jar s3://XX/jar/jcommander-1.78.jar s3://XX/jar/hudi-extensions-0.1.0-SNAPSHOT-bundled.jar s3://XX/jar/hudi-java-client-0.14.0.jar ``` # Step 2: Upload the sample dataset inside the test folder. Link: https://drive.google.com/drive/folders/1BwNEK649hErbsWcYLZhqCWnaXFX3mIsg?usp=share_link # Step 3: Create the Glue job with Delta Streamer and OneTable ``` import com.amazonaws.services.glue.GlueContext import com.amazonaws.services.glue.MappingSpec import com.amazonaws.services.glue.errors.CallSite import com.amazonaws.services.glue.util.GlueArgParser import com.amazonaws.services.glue.util.Job import com.amazonaws.services.glue.util.JsonOptions import org.apache.spark.SparkContext import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession import org.apache.spark.api.java.JavaSparkContext import org.apache.hudi.utilities.streamer.HoodieStreamer import org.apache.hudi.utilities.streamer.SchedulerConfGenerator import org.apache.hudi.utilities.UtilHelpers import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; object GlueApp { def main(sysArgs: Array[String]) { val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray) val BUCKET = "apply-systems-qa" var config = Array( "--source-class", "org.apache.hudi.utilities.sources.ParquetDFSSource", "--source-ordering-field", "replicadmstimestamp", s"--target-base-path", s"s3://$BUCKET/testcases/", "--target-table", "invoice", "--table-type" , "COPY_ON_WRITE", "--hoodie-conf", "hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator", "--hoodie-conf", "hoodie.datasource.write.recordkey.field=invoiceid", "--hoodie-conf", "hoodie.datasource.write.partitionpath.field=destinationstate", 
s"--hoodie-conf", s"hoodie.streamer.source.dfs.root=s3://$BUCKET/test/", "--hoodie-conf", "hoodie.datasource.write.precombine.field=replicadmstimestamp" ) val cfg = HoodieStreamer.getConfig(config) val additionalSparkConfigs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg) val jssc = UtilHelpers.buildSparkContext("delta-streamer-test", "jes", additionalSparkConfigs) val spark = jssc.sc val glueContext: GlueContext = new GlueContext(spark) Job.init(args("JOB_NAME"), glueContext, args.asJava) try { new HoodieStreamer(cfg, jssc).sync(); } finally { jssc.stop(); } Job.commit() import com.amazonaws.services.glue.GlueContext import com.amazonaws.services.glue.MappingSpec import com.amazonaws.services.glue.errors.CallSite import com.amazonaws.services.glue.util.GlueArgParser import com.amazonaws.services.glue.util.Job import com.amazonaws.services.glue.util.JsonOptions import org.apache.spark.SparkContext import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession import org.apache.spark.api.java.JavaSparkContext import org.apache.hudi.utilities.streamer.HoodieStreamer import org.apache.hudi.utilities.streamer.SchedulerConfGenerator import org.apache.hudi.utilities.UtilHelpers import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; object GlueApp { def main(sysArgs: Array[String]) { val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray) val BUCKET = "XX" var config = Array( "--source-class", "org.apache.hudi.utilities.sources.ParquetDFSSource", "--source-ordering-field", "replicadmstimestamp", s"--target-base-path", s"s3://$BUCKET/testcases/", "--target-table", "invoice", "--table-type", "COPY_ON_WRITE", "--sync-tool-classes", "io.onetable.hudi.sync.OneTableSyncTool", "--enable-sync", "--hoodie-conf", "hoodie.datasource.write.keygenerator.class=org.apache.hudi.keygen.SimpleKeyGenerator", "--hoodie-conf", "hoodie.datasource.write.recordkey.field=invoiceid", "--hoodie-conf", 
"hoodie.datasource.write.partitionpath.field=destinationstate", s"--hoodie-conf", s"hoodie.streamer.source.dfs.root=s3://$BUCKET/test/", "--hoodie-conf", "hoodie.datasource.write.precombine.field=replicadmstimestamp", "--hoodie-conf", "hoodie.onetable.formats.to.sync=DELTA", "--hoodie-conf", "hoodie.onetable.target.metadata.retention.hr=168", ) val cfg = HoodieStreamer.getConfig(config) val additionalSparkConfigs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg) val jssc = UtilHelpers.buildSparkContext("delta-streamer-test", "jes", additionalSparkConfigs) val spark = jssc.sc val glueContext: GlueContext = new GlueContext(spark) Job.init(args("JOB_NAME"), glueContext, args.asJava) try { new HoodieStreamer(cfg, jssc).sync(); } finally { jssc.stop(); } Job.commit() } } } } ``` # On the Glue side, make sure to add all the jars as shown in the images below ![image](https://github.com/apache/incubator-xtable/assets/39345855/4a735bce-4cd5-48a7-b00b-c31ae63c6871) ![Screenshot 2024-03-07 at 7 52 26 AM](https://github.com/apache/incubator-xtable/assets/39345855/6353934e-9d9d-4b48-a4e4-028faf58e297) The Hudi table is created. I think it fails in the sync tool with the same error as before, which most likely means a jar issue. # Error ``` 2024-03-07 12:45:56,334 ERROR [main] glue.ProcessLauncher (Logging.scala:logError(98)): Exception in User Class org.apache.hudi.exception.HoodieMetaSyncException: Could not sync using the meta sync class io.onetable.hudi.sync.OneTableSyncTool at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:81) ~[hudi-spark3.3-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.utilities.streamer.StreamSync.runMetaSync(StreamSync.java:938) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.utilities.streamer.StreamSync.writeToSink(StreamSync.java:851) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.utilities.streamer.StreamSync.syncOnce(StreamSync.java:446) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at 
org.apache.hudi.utilities.streamer.HoodieStreamer$StreamSyncService.ingestOnce(HoodieStreamer.java:840) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.utilities.ingestion.HoodieIngestionService.startIngestion(HoodieIngestionService.java:72) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.common.util.Option.ifPresent(Option.java:97) ~[hudi-spark3.3-bundle_2.12-0.14.0.jar:0.14.0] at org.apache.hudi.utilities.streamer.HoodieStreamer.sync(HoodieStreamer.java:205) ~[hudi-utilities-slim-bundle_2.12-0.14.0.jar:0.14.0] at GlueApp$.main(test0.scala:50) ~[test0.scala.jar:?] at GlueApp.main(test0.scala) ~[test0.scala.jar:?] at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_402] at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) ~[?:1.8.0_402] at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) ~[?:1.8.0_402] at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_402] at com.amazonaws.services.glue.SparkProcessLauncherPlugin.invoke(ProcessLauncher.scala:65) ~[AWSGlueSparkResourceManager-1.0.jar:?] at com.amazonaws.services.glue.SparkProcessLauncherPlugin.invoke$(ProcessLauncher.scala:65) ~[AWSGlueSparkResourceManager-1.0.jar:?] at com.amazonaws.services.glue.ProcessLauncher$$anon$2.invoke(ProcessLauncher.scala:212) ~[AWSGlueSparkResourceManager-1.0.jar:?] at com.amazonaws.services.glue.ProcessLauncher.launch(ProcessLauncher.scala:400) ~[AWSGlueSparkResourceManager-1.0.jar:?] at com.amazonaws.services.glue.ProcessLauncher$.main(ProcessLauncher.scala:45) ~[AWSGlueSparkResourceManager-1.0.jar:?] at com.amazonaws.services.glue.ProcessLauncher.main(ProcessLauncher.scala) ~[AWSGlueSparkResourceManager-1.0.jar:?] Caused by: java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/expressions/Empty2Null at org.apache.spark.sql.delta.DeltaLog.startTransaction(DeltaLog.scala:214) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] 
at org.apache.spark.sql.delta.DeltaLog.startTransaction(DeltaLog.scala:211) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.delta.DeltaClient$TransactionState.<init>(DeltaClient.java:232) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.delta.DeltaClient$TransactionState.<init>(DeltaClient.java:217) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.delta.DeltaClient.beginSync(DeltaClient.java:153) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.spi.sync.TableFormatSync.getSyncResult(TableFormatSync.java:154) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.spi.sync.TableFormatSync.syncSnapshot(TableFormatSync.java:70) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.client.OneTableClient.syncSnapshot(OneTableClient.java:179) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.client.OneTableClient.sync(OneTableClient.java:116) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.hudi.sync.OneTableSyncTool.syncHoodieTable(OneTableSyncTool.java:80) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:79) ~[hudi-spark3.3-bundle_2.12-0.14.0.jar:0.14.0] ... 19 more Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.expressions.Empty2Null at java.net.URLClassLoader.findClass(URLClassLoader.java:387) ~[?:1.8.0_402] at java.lang.ClassLoader.loadClass(ClassLoader.java:418) ~[?:1.8.0_402] at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352) ~[?:1.8.0_402] at java.lang.ClassLoader.loadClass(ClassLoader.java:351) ~[?:1.8.0_402] at org.apache.spark.sql.delta.DeltaLog.startTransaction(DeltaLog.scala:214) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at org.apache.spark.sql.delta.DeltaLog.startTransaction(DeltaLog.scala:211) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] 
at io.onetable.delta.DeltaClient$TransactionState.<init>(DeltaClient.java:232) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.delta.DeltaClient$TransactionState.<init>(DeltaClient.java:217) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.delta.DeltaClient.beginSync(DeltaClient.java:153) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.spi.sync.TableFormatSync.getSyncResult(TableFormatSync.java:154) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.spi.sync.TableFormatSync.syncSnapshot(TableFormatSync.java:70) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.client.OneTableClient.syncSnapshot(OneTableClient.java:179) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.client.OneTableClient.sync(OneTableClient.java:116) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at io.onetable.hudi.sync.OneTableSyncTool.syncHoodieTable(OneTableSyncTool.java:80) ~[hudi-extensions-0.1.0-SNAPSHOT-bundled.jar:?] at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:79) ~[hudi-spark3.3-bundle_2.12-0.14.0.jar:0.14.0] ... 19 more ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@xtable.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org