[ https://issues.apache.org/jira/browse/HUDI-1656?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Fredrick jose antony cruz updated HUDI-1656: -------------------------------------------- Description: spark-submit --jars /u/users/svcordrdats/order_hudi_poc/hudi-support-jars/org.apache.avro_avro-1.8.2.jar,/u/users/svcordrdats/order_hudi_poc/hudi-support-jars/spark-avro_2.11-2.4.4.jar,/u/users/svcordrdats/order_hudi_poc/hudi-support-jars/hudi-spark-bundle_2.11-0.7.0.jar --master yarn --deploy-mode cluster --num-executors 50 --executor-cores 4 --executor-memory 32g --driver-memory=24g --queue=default --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.driver.extraClassPath=org.apache.avro_avro-1.8.2.jar:spark-avro_2.11-2.4.4.jar --conf spark.executor.extraClassPath=org.apache.avro_avro-1.8.2.jar:spark-avro_2.11-2.4.4.jar:hudi-spark-bundle_2.11-0.7.0.jar --conf spark.memory.fraction=0.2 --driver-java-options "-XX:NewSize=1g -XX:SurvivorRatio=2 -XX:+UseCompressedOops -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime -XX:+PrintTenuringDistribution -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof" --files /usr/hdp/current/spark2-client/conf/hive-site.xml --class com.walmart.gis.order.workflows.WorkflowController lib/orders-poc-1.0.41-SNAPSHOT-shaded.jar workflow="stgStsWorkflow" runmode="global" We are running on a GCS cluster with 3 TB of memory: a 29-node cluster with 870 v-cores. 
pom <jdk-version>1.8</jdk-version> <scala.version>2.11.12</scala.version> <spark.version>2.3.0</spark.version> <avro-version>1.8.2</avro-version> <spark-avro.version>2.4.4</spark-avro.version> <hudi-version>0.7.0</hudi-version> <typesafe.config.version>1.4.0</typesafe.config.version> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <maven.compiler.target>1.8</maven.compiler.target> <maven.compiler.source>1.8</maven.compiler.source> stsDailyDf.write.format("org.apache.hudi") .option("hoodie.cleaner.commits.retained", 2) .option("hoodie.copyonwrite.record.size.estimate", 70) .option("hoodie.parquet.small.file.limit", 100000000) .option("hoodie.parquet.max.file.size", 128000000) .option("hoodie.index.bloom.num_entries", 1800000) .option("hoodie.bloom.index.filter.type", "DYNAMIC_V0") .option("hoodie.bloom.index.filter.dynamic.max.entries", 2500000) .option("hoodie.datasource.write.operation", "upsert") .option("hoodie.datasource.write.storage.type", "COPY_ON_WRITE") .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "sales_order_sts_line_key") .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "REL_STS_DT") .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "src_upd_ts") .option(HoodieWriteConfig.TABLE_NAME, tableName.toString) .option("hoodie.bloom.index.bucketized.checking", "false") .mode(SaveMode.Append) .save(tablePath.toString) > Loading history data to new hudi table taking longer time > --------------------------------------------------------- > > Key: HUDI-1656 > URL: https://issues.apache.org/jira/browse/HUDI-1656 > Project: Apache Hudi > Issue Type: Improvement > Components: newbie > Reporter: Fredrick jose antony cruz > Priority: Major > Fix For: 0.7.0 > > > spark-submit --jars > 
/u/users/svcordrdats/order_hudi_poc/hudi-support-jars/org.apache.avro_avro-1.8.2.jar,/u/users/svcordrdats/order_hudi_poc/hudi-support-jars/spark-avro_2.11-2.4.4.jar,/u/users/svcordrdats/order_hudi_poc/hudi-support-jars/hudi-spark-bundle_2.11-0.7.0.jar > --master yarn --deploy-mode cluster --num-executors 50 --executor-cores 4 > --executor-memory 32g --driver-memory=24g --queue=default --conf > spark.serializer=org.apache.spark.serializer.KryoSerializer --conf > spark.driver.extraClassPath=org.apache.avro_avro-1.8.2.jar:spark-avro_2.11-2.4.4.jar > --conf > spark.executor.extraClassPath=org.apache.avro_avro-1.8.2.jar:spark-avro_2.11-2.4.4.jar:hudi-spark-bundle_2.11-0.7.0.jar > --conf spark.memory.fraction=0.2 --driver-java-options "-XX:NewSize=1g > -XX:SurvivorRatio=2 -XX:+UseCompressedOops -XX:+UseConcMarkSweepGC > -XX:+UseParNewGC -XX:CMSInitiatingOccupancyFraction=70 -XX:+PrintGCDetails > -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps > -XX:+PrintGCApplicationStoppedTime -XX:+PrintGCApplicationConcurrentTime > -XX:+PrintTenuringDistribution -XX:+HeapDumpOnOutOfMemoryError > -XX:HeapDumpPath=/tmp/hoodie-heapdump.hprof" --files > /usr/hdp/current/spark2-client/conf/hive-site.xml --class > com.walmart.gis.order.workflows.WorkflowController > lib/orders-poc-1.0.41-SNAPSHOT-shaded.jar workflow="stgStsWorkflow" > runmode="global" > We are running on a GCS cluster with 3 TB of memory: a 29-node cluster with 870 v-cores. 
> pom > <jdk-version>1.8</jdk-version> > <scala.version>2.11.12</scala.version> > <spark.version>2.3.0</spark.version> > <avro-version>1.8.2</avro-version> > <spark-avro.version>2.4.4</spark-avro.version> > <hudi-version>0.7.0</hudi-version> > <typesafe.config.version>1.4.0</typesafe.config.version> > <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> > <maven.compiler.target>1.8</maven.compiler.target> > <maven.compiler.source>1.8</maven.compiler.source> > stsDailyDf.write.format("org.apache.hudi") > .option("hoodie.cleaner.commits.retained", 2) > .option("hoodie.copyonwrite.record.size.estimate", 70) > .option("hoodie.parquet.small.file.limit", 100000000) > .option("hoodie.parquet.max.file.size", 128000000) > .option("hoodie.index.bloom.num_entries", 1800000) > .option("hoodie.bloom.index.filter.type", "DYNAMIC_V0") > .option("hoodie.bloom.index.filter.dynamic.max.entries", 2500000) > .option("hoodie.datasource.write.operation", "upsert") > .option("hoodie.datasource.write.storage.type", "COPY_ON_WRITE") > .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, > "sales_order_sts_line_key") > .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, > "REL_STS_DT") > .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "src_upd_ts") > .option(HoodieWriteConfig.TABLE_NAME, tableName.toString) > .option("hoodie.bloom.index.bucketized.checking", "false") > .mode(SaveMode.Append) > .save(tablePath.toString) -- This message was sent by Atlassian Jira (v8.3.4#803005)