This is an automated email from the ASF dual-hosted git repository.

codope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
     new 31e13db1f0  [HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)
31e13db1f0 is described below

commit 31e13db1f0e12e107cc02c60dec3e52a8914a5b2
Author: Sagar Sumit <sagarsumi...@gmail.com>
AuthorDate: Thu May 26 11:28:49 2022 +0530

    [HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)
---
 .../hudi/utilities/deltastreamer/DeltaSync.java |   2 -
 packaging/hudi-utilities-slim-bundle/README.md  |  89 ++++++++++++-
 packaging/hudi-utilities-slim-bundle/pom.xml    | 143 +++------------------
 pom.xml                                         |   7 +
 4 files changed, 109 insertions(+), 132 deletions(-)

diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
index a4a7e10abc..0ae72f94b8 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
@@ -605,8 +605,6 @@ public class DeltaSync implements Serializable {
     long totalErrorRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalErrorRecords).sum().longValue();
     long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
     boolean hasErrors = totalErrorRecords > 0;
-    long hiveSyncTimeMs = 0;
-    long metaSyncTimeMs = 0;
     if (!hasErrors || cfg.commitOnErrors) {
       HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
       if (checkpointStr != null) {
diff --git a/packaging/hudi-utilities-slim-bundle/README.md b/packaging/hudi-utilities-slim-bundle/README.md
index 58353c403d..60ee739153 100644
--- a/packaging/hudi-utilities-slim-bundle/README.md
+++ b/packaging/hudi-utilities-slim-bundle/README.md
@@ -17,6 +17,89 @@
 # Usage of hudi-utilities-slim-bundle
 
-Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules.
-This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely
-introduces problems for a specific Spark version.
\ No newline at end of file
+Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules. This new bundle is intended to be used with Hudi Spark bundle together, if using
+hudi-utilities-bundle solely introduces problems for a specific Spark version.
+
+## Example with Spark 2.4.7
+
+* Build Hudi: `mvn clean install -DskipTests`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+  --conf spark.sql.catalogImplementation=hive \
+  --conf spark.driver.maxResultSize=1g \
+  --conf spark.ui.port=6679 \
+  --packages org.apache.spark:spark-avro_2.11:2.4.7 \
+  --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \
+  --props `ls /path/to/hudi/dfs-source.properties` \
+  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+  --source-ordering-field tpep_dropoff_datetime \
+  --table-type COPY_ON_WRITE \
+  --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark24/ \
+  --target-table ny_hudi_tbl \
+  --op UPSERT \
+  --continuous \
+  --source-limit 5000000 \
+  --min-sync-interval-seconds 60
+```
+
+## Example with Spark 3.1.2
+
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+  --conf spark.sql.catalogImplementation=hive \
+  --conf spark.driver.maxResultSize=1g \
+  --conf spark.ui.port=6679 \
+  --packages org.apache.spark:spark-avro_2.12:3.1.2 \
+  --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
+  --props `ls /path/to/hudi/dfs-source.properties` \
+  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+  --source-ordering-field tpep_dropoff_datetime \
+  --table-type COPY_ON_WRITE \
+  --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark31/ \
+  --target-table ny_hudi_tbl \
+  --op UPSERT \
+  --continuous \
+  --source-limit 5000000 \
+  --min-sync-interval-seconds 60
+```
+
+## Example with Spark 3.2.0
+
+* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12`
+* Run deltastreamer
+
+```
+bin/spark-submit \
+  --driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
+  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
+  --conf spark.sql.catalogImplementation=hive \
+  --conf spark.driver.maxResultSize=1g \
+  --conf spark.ui.port=6679 \
+  --packages org.apache.spark:spark-avro_2.12:3.2.0 \
+  --jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
+  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
+  --props `ls /path/to/hudi/dfs-source.properties` \
+  --source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
+  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
+  --source-ordering-field tpep_dropoff_datetime \
+  --table-type COPY_ON_WRITE \
+  --target-base-path file:\/\/\/tmp/hudi-ny-taxi-spark32/ \
+  --target-table ny_hudi_tbl \
+  --op UPSERT \
+  --continuous \
+  --source-limit 5000000 \
+  --min-sync-interval-seconds 60
+```
diff --git a/packaging/hudi-utilities-slim-bundle/pom.xml b/packaging/hudi-utilities-slim-bundle/pom.xml
index 60f0af9d64..993e2ad7fd 100644
--- a/packaging/hudi-utilities-slim-bundle/pom.xml
+++ b/packaging/hudi-utilities-slim-bundle/pom.xml
@@ -77,7 +77,7 @@
               <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
               </transformer>
               <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
-                <addHeader>true</addHeader>
+                <addHeader>true</addHeader>
               </transformer>
               <transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
                 <resource>META-INF/LICENSE</resource>
@@ -92,10 +92,7 @@
               <includes>
                 <include>org.apache.hudi:hudi-common</include>
                 <include>org.apache.hudi:hudi-client-common</include>
-                <include>org.apache.hudi:hudi-spark-client</include>
                 <include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
-                <include>org.apache.hudi:hudi-hive-sync</include>
-                <include>org.apache.hudi:hudi-sync-common</include>
                 <include>org.apache.hudi:hudi-hadoop-mr</include>
                 <include>org.apache.hudi:hudi-timeline-service</include>
                 <include>org.apache.hudi:hudi-aws</include>
@@ -136,13 +133,6 @@
                 <include>org.apache.kafka:kafka_${scala.binary.version}</include>
                 <include>com.101tec:zkclient</include>
                 <include>org.apache.kafka:kafka-clients</include>
-
-                <include>org.apache.hive:hive-common</include>
-                <include>org.apache.hive:hive-service</include>
-                <include>org.apache.hive:hive-service-rpc</include>
-                <include>org.apache.hive:hive-metastore</include>
-                <include>org.apache.hive:hive-jdbc</include>
-
                 <include>org.apache.hbase:hbase-client</include>
                 <include>org.apache.hbase:hbase-common</include>
                 <include>org.apache.hbase:hbase-hadoop-compat</include>
@@ -178,10 +168,6 @@
                 <pattern>com.beust.jcommander.</pattern>
                 <shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
               </relocation>
-              <relocation>
-                <pattern>org.apache.hive.jdbc.</pattern>
-                <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
-              </relocation>
               <relocation>
                 <pattern>org.apache.commons.io.</pattern>
                 <shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
@@ -205,10 +191,6 @@
                 <pattern>org.apache.hadoop.hive.metastore.</pattern>
                 <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
               </relocation>
-              <relocation>
-                <pattern>org.apache.hive.common.</pattern>
-                <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.common.</shadedPattern>
-              </relocation>
               <relocation>
                 <pattern>org.apache.hadoop.hive.common.</pattern>
                 <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common.</shadedPattern>
@@ -217,10 +199,6 @@
                 <pattern>org.apache.hadoop.hive.conf.</pattern>
                 <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf.</shadedPattern>
               </relocation>
-              <relocation>
-                <pattern>org.apache.hive.service.</pattern>
-                <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.service.</shadedPattern>
-              </relocation>
               <relocation>
                 <pattern>org.apache.hadoop.hive.service.</pattern>
                 <shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service.</shadedPattern>
@@ -344,116 +322,27 @@
     </dependency>
     <dependency>
       <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-client-common</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark-client</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-hive-sync</artifactId>
+      <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
       <exclusions>
         <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>hudi-spark_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.hudi</groupId>
+          <artifactId>${hudi.spark.common.module}</artifactId>
         </exclusion>
       </exclusions>
     </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-spark_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>${hudi.spark.common.module}</artifactId>
-      <version>${project.version}</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hudi</groupId>
-      <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-
-    <!-- Hive -->
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-service</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-service-rpc</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-jdbc</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-metastore</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>${hive.groupid}</groupId>
-      <artifactId>hive-common</artifactId>
-      <version>${hive.version}</version>
-      <scope>${utilities.bundle.hive.scope}</scope>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.htrace</groupId>
-      <artifactId>htrace-core</artifactId>
-      <version>${htrace.version}</version>
-      <scope>compile</scope>
-    </dependency>
-
-    <!-- zookeeper -->
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-framework</artifactId>
-      <version>${zk-curator.version}</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-client</artifactId>
-      <version>${zk-curator.version}</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.curator</groupId>
-      <artifactId>curator-recipes</artifactId>
-      <version>${zk-curator.version}</version>
-    </dependency>
   </dependencies>
 
   <profiles>
diff --git a/pom.xml b/pom.xml
index d898d34d35..1188ec620a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -99,6 +99,7 @@
     <pulsar.version>2.8.1</pulsar.version>
     <confluent.version>5.3.4</confluent.version>
     <glassfish.version>2.17</glassfish.version>
+    <glassfish.el.version>3.0.1-b12</glassfish.el.version>
    <parquet.version>1.10.1</parquet.version>
    <junit.jupiter.version>5.7.0-M1</junit.jupiter.version>
    <junit.vintage.version>5.7.0-M1</junit.vintage.version>
@@ -556,6 +557,12 @@
       <artifactId>jersey-container-servlet-core</artifactId>
       <version>${glassfish.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.glassfish</groupId>
+      <artifactId>javax.el</artifactId>
+      <version>${glassfish.el.version}</version>
+      <scope>provided</scope>
+    </dependency>
 
     <!-- Avro -->
     <dependency>
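The exclusions added to the slim bundle's pom reference the Maven properties `${hudi.spark.module}` and `${hudi.spark.common.module}`, which the root pom resolves per Spark profile (the same profiles the README examples select with `-Dspark3.1` / `-Dspark3.2`). A minimal sketch of how such profile-driven properties are typically wired is shown below; the profile id, activation flag, and module values are illustrative assumptions, not the exact contents of Hudi's root pom.

```xml
<!-- Illustrative sketch only: the property names come from the diff above; the
     profile id, activation flag, and module values are assumed for demonstration. -->
<properties>
  <!-- assumed defaults, e.g. for a Spark 2 build -->
  <hudi.spark.module>hudi-spark2</hudi.spark.module>
  <hudi.spark.common.module>hudi-spark2-common</hudi.spark.common.module>
</properties>

<profiles>
  <profile>
    <id>spark3.1</id>
    <activation>
      <!-- activated by passing -Dspark3.1 to mvn, as in the README build command -->
      <property>
        <name>spark3.1</name>
      </property>
    </activation>
    <properties>
      <hudi.spark.module>hudi-spark3.1.x</hudi.spark.module>
      <hudi.spark.common.module>hudi-spark3-common</hudi.spark.common.module>
    </properties>
  </profile>
</profiles>
```

With properties wired this way, the slim bundle's exclusions track whichever hudi-spark-datasource modules the active profile builds, so a single bundle pom stays decoupled from any specific Spark version.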