Build failed in Jenkins: hudi-snapshot-deployment-0.5 #360

2020-08-04 Thread Apache Jenkins Server
See 


Changes:


--
[...truncated 2.45 KB...]

/home/jenkins/tools/maven/apache-maven-3.5.4/conf/logging:
simplelogger.properties

/home/jenkins/tools/maven/apache-maven-3.5.4/lib:
aopalliance-1.0.jar
cdi-api-1.0.jar
cdi-api.license
commons-cli-1.4.jar
commons-cli.license
commons-io-2.5.jar
commons-io.license
commons-lang3-3.5.jar
commons-lang3.license
ext
guava-20.0.jar
guice-4.2.0-no_aop.jar
jansi-1.17.1.jar
jansi-native
javax.inject-1.jar
jcl-over-slf4j-1.7.25.jar
jcl-over-slf4j.license
jsr250-api-1.0.jar
jsr250-api.license
maven-artifact-3.5.4.jar
maven-artifact.license
maven-builder-support-3.5.4.jar
maven-builder-support.license
maven-compat-3.5.4.jar
maven-compat.license
maven-core-3.5.4.jar
maven-core.license
maven-embedder-3.5.4.jar
maven-embedder.license
maven-model-3.5.4.jar
maven-model-builder-3.5.4.jar
maven-model-builder.license
maven-model.license
maven-plugin-api-3.5.4.jar
maven-plugin-api.license
maven-repository-metadata-3.5.4.jar
maven-repository-metadata.license
maven-resolver-api-1.1.1.jar
maven-resolver-api.license
maven-resolver-connector-basic-1.1.1.jar
maven-resolver-connector-basic.license
maven-resolver-impl-1.1.1.jar
maven-resolver-impl.license
maven-resolver-provider-3.5.4.jar
maven-resolver-provider.license
maven-resolver-spi-1.1.1.jar
maven-resolver-spi.license
maven-resolver-transport-wagon-1.1.1.jar
maven-resolver-transport-wagon.license
maven-resolver-util-1.1.1.jar
maven-resolver-util.license
maven-settings-3.5.4.jar
maven-settings-builder-3.5.4.jar
maven-settings-builder.license
maven-settings.license
maven-shared-utils-3.2.1.jar
maven-shared-utils.license
maven-slf4j-provider-3.5.4.jar
maven-slf4j-provider.license
org.eclipse.sisu.inject-0.3.3.jar
org.eclipse.sisu.inject.license
org.eclipse.sisu.plexus-0.3.3.jar
org.eclipse.sisu.plexus.license
plexus-cipher-1.7.jar
plexus-cipher.license
plexus-component-annotations-1.7.1.jar
plexus-component-annotations.license
plexus-interpolation-1.24.jar
plexus-interpolation.license
plexus-sec-dispatcher-1.4.jar
plexus-sec-dispatcher.license
plexus-utils-3.1.0.jar
plexus-utils.license
slf4j-api-1.7.25.jar
slf4j-api.license
wagon-file-3.1.0.jar
wagon-file.license
wagon-http-3.1.0-shaded.jar
wagon-http.license
wagon-provider-api-3.1.0.jar
wagon-provider-api.license

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/ext:
README.txt

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native:
freebsd32
freebsd64
linux32
linux64
osx
README.txt
windows32
windows64

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/freebsd32:
libjansi.so

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/freebsd64:
libjansi.so

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/linux32:
libjansi.so

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/linux64:
libjansi.so

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/osx:
libjansi.jnilib

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/windows32:
jansi.dll

/home/jenkins/tools/maven/apache-maven-3.5.4/lib/jansi-native/windows64:
jansi.dll
Finished /home/jenkins/tools/maven/apache-maven-3.5.4 Directory Listing :
Detected current version as: 'HUDI_home= 0.6.0-SNAPSHOT'
[INFO] Scanning for projects...
[WARNING] 
[WARNING] Some problems were encountered while building the effective model for 
org.apache.hudi:hudi-spark_2.11:jar:0.6.0-SNAPSHOT
[WARNING] 'artifactId' contains an expression but should be a constant. @ 
org.apache.hudi:hudi-spark_${scala.binary.version}:[unknown-version], 

 line 26, column 15
[WARNING] 
[WARNING] Some problems were encountered while building the effective model for 
org.apache.hudi:hudi-utilities_2.11:jar:0.6.0-SNAPSHOT
[WARNING] 'artifactId' contains an expression but should be a constant. @ 
org.apache.hudi:hudi-utilities_${scala.binary.version}:[unknown-version], 

 line 26, column 15
[WARNING] 
[WARNING] Some problems were encountered while building the effective model for 
org.apache.hudi:hudi-spark-bundle_2.11:jar:0.6.0-SNAPSHOT
[WARNING] 'artifactId' contains an expression but should be a constant. @ 
org.apache.hudi:hudi-spark-bundle_${scala.binary.version}:[unknown-version], 

 line 26, column 15
[WARNING] 
[WARNING] Some problems were encountered while building the effective model for 
org.apache.hudi:hudi-utilities-bundle_2.11:jar:0.6.0-SNAPSHOT
[WARNING] 'artifactId' contains an expression but should be a constant. @ 
org.apache.hudi:hudi-utilities-bundle_${scala.binary.version}:[unknown-version], 

 line 26, column 15

[jira] [Resolved] (HUDI-1140) JCommander not passing command line arguments with comma separated values.

2020-08-04 Thread Sreeram Ramji (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1140?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Sreeram Ramji resolved HUDI-1140.
-
Resolution: Fixed

> JCommander not passing command line arguments with comma separated values.
> --
>
> Key: HUDI-1140
> URL: https://issues.apache.org/jira/browse/HUDI-1140
> Project: Apache Hudi
>  Issue Type: Bug
>  Components: DeltaStreamer
>Reporter: Balaji Varadarajan
>Assignee: Sreeram Ramji
>Priority: Major
>  Labels: newbe, pull-request-available
> Fix For: 0.6.1
>
>
> Please see 
> [https://github.com/apache/hudi/issues/1586|https://github.com/apache/hudi/issues/1586#issuecomment-666419674]
>  for context.
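
For context, the underlying JCommander behaviour: multi-valued parameters are split on commas by default, so a single --hoodie-conf value containing commas arrives as several values. A minimal standalone demonstration of that default (not Hudi code):

{code:java}
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;

import java.util.List;

public class CommaDemo {

  // No custom splitter: JCommander's default splits each value on commas.
  @Parameter(names = "--hoodie-conf")
  public List<String> configs;

  public static void main(String[] args) {
    CommaDemo demo = new CommaDemo();
    JCommander.newBuilder().addObject(demo).build()
        .parse("--hoodie-conf", "hoodie.datasource.write.partitionpath.field=year,month");
    // Prints two entries instead of the intended one:
    // [hoodie.datasource.write.partitionpath.field=year, month]
    System.out.println(demo.configs);
  }
}
{code}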



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] prashantwason commented on pull request #1915: [HUDI-1149] Added a console metrics reporter and associated unit tests.

2020-08-04 Thread GitBox


prashantwason commented on pull request #1915:
URL: https://github.com/apache/hudi/pull/1915#issuecomment-668977621


   @n3nash Updated and ready for review.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] prashantwason commented on a change in pull request #1915: [HUDI-1149] Added a console metrics reporter and associated unit tests.

2020-08-04 Thread GitBox


prashantwason commented on a change in pull request #1915:
URL: https://github.com/apache/hudi/pull/1915#discussion_r465470189



##
File path: 
hudi-client/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java
##
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.metrics;
+
+import java.io.Closeable;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import com.codahale.metrics.ConsoleReporter;
+import com.codahale.metrics.MetricFilter;
+import com.codahale.metrics.MetricRegistry;
+
+/**
+ * Used for testing.

Review comment:
   Updated to a more useful comment.
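
For readers following the diff: the reporter under review builds on Dropwizard's ConsoleReporter (the imports above). A minimal sketch of that underlying API, with an assumed 30-second reporting period (the period and registry here are illustrative, not necessarily what the PR configures):

```java
import java.util.concurrent.TimeUnit;

import com.codahale.metrics.ConsoleReporter;
import com.codahale.metrics.MetricFilter;
import com.codahale.metrics.MetricRegistry;

public class ConsoleReporterSketch {
  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    registry.counter("commits").inc(); // sample metric so the dump is non-empty

    ConsoleReporter reporter = ConsoleReporter.forRegistry(registry)
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .filter(MetricFilter.ALL)
        .build();
    reporter.start(30, TimeUnit.SECONDS); // periodic dump to stdout
    reporter.report();                    // or force a one-off dump
    reporter.stop();
  }
}
```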





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] bvaradar merged pull request #1898: [HUDI-1140] Fix Jcommander issue for --hoodie-conf in DeltaStreamer

2020-08-04 Thread GitBox


bvaradar merged pull request #1898:
URL: https://github.com/apache/hudi/pull/1898


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[hudi] branch master updated (d3711a2 -> 217a841)

2020-08-04 Thread vbalaji
This is an automated email from the ASF dual-hosted git repository.

vbalaji pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from d3711a2  [HUDI-525] lack of insert info in delta_commit inflight
 add 217a841  [HUDI-1140] Fix Jcommander issue for --hoodie-conf in 
DeltaStreamer (#1898)

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/utilities/HDFSParquetImporter.java |   3 +-
 .../org/apache/hudi/utilities/HoodieCleaner.java   |   3 +-
 .../org/apache/hudi/utilities/HoodieCompactor.java |   3 +-
 .../apache/hudi/utilities/IdentitySplitter.java|  16 +--
 .../deltastreamer/HoodieDeltaStreamer.java | 104 +++-
 .../HoodieMultiTableDeltaStreamer.java |   4 +-
 .../functional/TestHoodieDeltaStreamer.java| 107 +
 7 files changed, 223 insertions(+), 17 deletions(-)
 copy 
hudi-hive-sync/src/main/java/org/apache/hudi/hive/NonPartitionedExtractor.java 
=> hudi-utilities/src/main/java/org/apache/hudi/utilities/IdentitySplitter.java 
(70%)
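
The IdentitySplitter copied in above is the core of the fix: a JCommander splitter that leaves values untouched. A sketch of the pattern (the committed class may differ in detail):

```java
import com.beust.jcommander.converters.IParameterSplitter;

import java.util.Collections;
import java.util.List;

public class IdentitySplitter implements IParameterSplitter {
  @Override
  public List<String> split(String value) {
    // Return the raw value so commas inside a single --hoodie-conf survive.
    return Collections.singletonList(value);
  }
}

// Applied to the affected parameters, e.g.:
//   @Parameter(names = "--hoodie-conf", splitter = IdentitySplitter.class)
//   public List<String> configs;
```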



[GitHub] [hudi] vinothchandar commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


vinothchandar commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465468482



##
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java
##
@@ -67,10 +77,15 @@ String getMetricsName(String action, String metric) {
     return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
   }
 
-  public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) {
+  public void updateDeltaStreamerMetrics(long durationInNs) {
     if (config.isMetricsOn()) {
       Metrics.registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs));
-      Metrics.registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs));
+    }
+  }
+
+  public void updateDeltaStreamerMetaSyncMetrics(String syncClassName, long syncNs) {

Review comment:
   do we need the entire class name here? Would that not make for a long 
metric name? :) 
   
   May be have a `getShortName()` method for the AbstractSyncTool class and 
return "hive" and "dla" from them? 
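
A sketch of that suggestion (getShortName() is hypothetical here, not existing Hudi API):

```java
public abstract class AbstractSyncTool {
  // Hypothetical: a short identifier for metric names, e.g. "hive" or "dla",
  // instead of the fully qualified sync class name.
  public abstract String getShortName();
}

// Metric names would then stay compact:
//   getMetricsName("deltastreamer", syncTool.getShortName() + "SyncDuration")
//   -> "<table>.deltastreamer.hiveSyncDuration"
```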





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] vinothchandar commented on pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


vinothchandar commented on pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#issuecomment-668974757


   @lw309637554 is this ready for a final review? 



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] umehrot2 commented on pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#issuecomment-668972215


   @vinothchandar the tests are passing, so it's ready for review from my side.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[jira] [Reopened] (HUDI-427) Implement CLI support for performing bootstrap

2020-08-04 Thread Udit Mehrotra (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-427?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Udit Mehrotra reopened HUDI-427:


> Implement CLI support for performing bootstrap
> --
>
> Key: HUDI-427
> URL: https://issues.apache.org/jira/browse/HUDI-427
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: CLI
>Reporter: Balaji Varadarajan
>Assignee: Wenning Ding
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>  Time Spent: 168h
>  Remaining Estimate: 0h
>
> Need CLI to perform bootstrap as described in 
> [https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+%3A+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi]



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-1098) Marker file finalizing may block on a data file that was never written

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1098?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-1098:
-
Status: Patch Available  (was: In Progress)

> Marker file finalizing may block on a data file that was never written
> --
>
> Key: HUDI-1098
> URL: https://issues.apache.org/jira/browse/HUDI-1098
> Project: Apache Hudi
>  Issue Type: Bug
>  Components: Writer Core
>Reporter: Vinoth Chandar
>Assignee: sivabalan narayanan
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>
> {code:java}
> // Ensure all files in delete list is actually present. This is mandatory for
> // an eventually consistent FS. Otherwise, we may miss deleting such files.
> // If files are not found even after retries, fail the commit
> if (consistencyCheckEnabled) {
>   // This will either ensure all files to be deleted are present.
>   waitForAllFiles(jsc, groupByPartition, FileVisibility.APPEAR);
> }
> {code}
> We need to handle the case where marker file was created, but we crashed 
> before the data file was created. 
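
A hedged sketch of the handling this asks for; every name below (markerPaths, markerToDataPath, createdDataPaths, pathsToWaitFor) is illustrative, not existing Hudi API:

{code:java}
// Only wait for data files that were actually written. A marker with no
// corresponding entry in the commit's written-files set means the writer
// crashed between marker creation and data write, so there is nothing to
// wait for.
for (String marker : markerPaths) {
  String dataPath = markerToDataPath(marker);   // hypothetical helper
  if (createdDataPaths.contains(dataPath)) {
    pathsToWaitFor.add(dataPath);
  }
  // else: stale marker only; skip it instead of blocking the commit
}
{code}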



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-1091) Handle empty input batch gracefully in ParquetDFSSource

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1091?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-1091:
-
Priority: Major  (was: Blocker)

> Handle empty input batch gracefully in ParquetDFSSource
> ---
>
> Key: HUDI-1091
> URL: https://issues.apache.org/jira/browse/HUDI-1091
> Project: Apache Hudi
>  Issue Type: Bug
>  Components: DeltaStreamer
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Major
> Fix For: 0.6.0
>
>
> [https://github.com/apache/hudi/issues/1813]
>  Looking at 0.5.3, it is possible the below exception can happen when running 
> in standalone mode and the next batch to write is empty.
> ERROR HoodieDeltaStreamer: Got error running delta sync once. Shutting down
> org.apache.hudi.exception.HoodieException: Please provide a valid schema provider class!
>   at org.apache.hudi.utilities.sources.InputBatch.getSchemaProvider(InputBatch.java:53)
>   at org.apache.hudi.utilities.deltastreamer.DeltaSync.readFromSource(DeltaSync.java:312)
>   at org.apache.hudi.utilities.deltastreamer.DeltaSync.syncOnce(DeltaSync.java:226)
>   at org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.sync(HoodieDeltaStreamer.java:121)
>   at org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.main(HoodieDeltaStreamer.java:294)
>   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:498)
>   at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
>   at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:853)
>   at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
>   at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
>   at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
>   at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:928)
>   at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:937)
>   at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
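
A hedged sketch of the graceful handling suggested here; the control flow is modeled on DeltaSync.readFromSource, but the variable names are illustrative:

{code:java}
// An empty batch has no schema provider attached, so bail out early instead
// of letting getSchemaProvider() throw the HoodieException above.
if (!inputBatch.getBatch().isPresent()) {
  LOG.info("No new data in this round; skipping write.");
  return;
}
SchemaProvider schemaProvider = inputBatch.getSchemaProvider(); // safe now
{code}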



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-841) Abstract common meta sync module support multiple meta service

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-841?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-841:

Status: In Progress  (was: Open)

> Abstract common meta sync module support multiple meta service
> --
>
> Key: HUDI-841
> URL: https://issues.apache.org/jira/browse/HUDI-841
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Hive Integration
>Reporter: liwei
>Assignee: liwei
>Priority: Blocker
> Fix For: 0.6.0
>
>
> Currently Hudi only supports syncing dataset metadata to Hive through Hive 
> JDBC and IMetaStoreClient. When you need to sync to other frameworks, such as 
> AWS Glue or Aliyun DataLake Analytics, you need to copy a lot of code from 
> HoodieHiveClient, which creates a lot of redundant code. So we need to 
> redesign the hudi-hive-sync module to support other frameworks and reuse the 
> current code as much as possible. Only the interface is provided by Hudi, and 
> the implementation is customized by the different services (Hive, AWS Glue, 
> Aliyun DataLake Analytics).
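
A sketch of the abstraction being proposed (the shape is illustrative; AbstractSyncTool and the hoodie.sync.client.tool.class config key appear in PR #1810 later in this thread):

{code:java}
// Common entry point provided by Hudi; each catalog supplies an implementation.
public abstract class AbstractSyncTool {
  public abstract void syncHoodieTable();
}

// Implementations: HiveSyncTool, DLASyncTool, or a hypothetical GlueSyncTool,
// selected at runtime via a class-name config such as hoodie.sync.client.tool.class.
{code}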



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-841) Abstract common meta sync module support multiple meta service

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-841?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-841:

Status: Patch Available  (was: In Progress)

> Abstract common meta sync module support multiple meta service
> --
>
> Key: HUDI-841
> URL: https://issues.apache.org/jira/browse/HUDI-841
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Hive Integration
>Reporter: liwei
>Assignee: liwei
>Priority: Blocker
> Fix For: 0.6.0
>
>
> Currently Hudi only supports syncing dataset metadata to Hive through Hive 
> JDBC and IMetaStoreClient. When you need to sync to other frameworks, such as 
> AWS Glue or Aliyun DataLake Analytics, you need to copy a lot of code from 
> HoodieHiveClient, which creates a lot of redundant code. So we need to 
> redesign the hudi-hive-sync module to support other frameworks and reuse the 
> current code as much as possible. Only the interface is provided by Hudi, and 
> the implementation is customized by the different services (Hive, AWS Glue, 
> Aliyun DataLake Analytics).



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-841) Abstract common meta sync module support multiple meta service

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-841?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-841:

Status: Open  (was: New)

> Abstract common meta sync module support multiple meta service
> --
>
> Key: HUDI-841
> URL: https://issues.apache.org/jira/browse/HUDI-841
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Hive Integration
>Reporter: liwei
>Assignee: liwei
>Priority: Blocker
> Fix For: 0.6.0
>
>
> Currently Hudi only supports syncing dataset metadata to Hive through Hive 
> JDBC and IMetaStoreClient. When you need to sync to other frameworks, such as 
> AWS Glue or Aliyun DataLake Analytics, you need to copy a lot of code from 
> HoodieHiveClient, which creates a lot of redundant code. So we need to 
> redesign the hudi-hive-sync module to support other frameworks and reuse the 
> current code as much as possible. Only the interface is provided by Hudi, and 
> the implementation is customized by the different services (Hive, AWS Glue, 
> Aliyun DataLake Analytics).



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Assigned] (HUDI-1014) Design and Implement upgrade-downgrade infrastructure

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1014?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar reassigned HUDI-1014:


Assignee: Vinoth Chandar  (was: sivabalan narayanan)

> Design and Implement upgrade-downgrade infrastructure
> -
>
> Key: HUDI-1014
> URL: https://issues.apache.org/jira/browse/HUDI-1014
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Common Core, Writer Core
>Reporter: Vinoth Chandar
>Assignee: Vinoth Chandar
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>




--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-242) [RFC-12] Support Efficient bootstrap of large parquet datasets to Hudi

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-242?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-242:

Summary: [RFC-12] Support Efficient bootstrap of large parquet datasets to 
Hudi  (was: Support Efficient bootstrap of large parquet datasets to Hudi)

> [RFC-12] Support Efficient bootstrap of large parquet datasets to Hudi
> --
>
> Key: HUDI-242
> URL: https://issues.apache.org/jira/browse/HUDI-242
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Usability
>Reporter: Balaji Varadarajan
>Assignee: Vinoth Chandar
>Priority: Major
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>
>  Support Efficient bootstrap of large parquet tables



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Reopened] (HUDI-426) Implement Spark DataSource Support for querying bootstrapped tables

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-426?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar reopened HUDI-426:
-

> Implement Spark DataSource Support for querying bootstrapped tables
> ---
>
> Key: HUDI-426
> URL: https://issues.apache.org/jira/browse/HUDI-426
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Spark Integration
>Reporter: Balaji Varadarajan
>Assignee: Udit Mehrotra
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>  Time Spent: 10m
>  Remaining Estimate: 0h
>
> We need the ability in the Spark DataSource to query a COW table which is 
> bootstrapped as per 
> [https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+:+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi#RFC-12:EfficientMigrationofLargeParquetTablestoApacheHudi-BootstrapIndex:]
>  
> The current implementation delegates to the Parquet DataSource, but this won't 
> work as we need the ability to stitch the columns externally.
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-242) Support Efficient bootstrap of large parquet datasets to Hudi

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-242?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-242:

Priority: Major  (was: Blocker)

> Support Efficient bootstrap of large parquet datasets to Hudi
> -
>
> Key: HUDI-242
> URL: https://issues.apache.org/jira/browse/HUDI-242
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Usability
>Reporter: Balaji Varadarajan
>Assignee: Vinoth Chandar
>Priority: Major
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>
>  Support Efficient bootstrap of large parquet tables



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-900) Metadata Bootstrap Key Generator needs to handle complex keys correctly

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-900?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-900:

Status: Closed  (was: Patch Available)

> Metadata Bootstrap Key Generator needs to handle complex keys correctly
> ---
>
> Key: HUDI-900
> URL: https://issues.apache.org/jira/browse/HUDI-900
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Writer Core
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 24h
>  Remaining Estimate: 0h
>
> Look at ComplexKeyGenerator. Make sure MetadataBootstrap uses the same format.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-1054) Address performance issues with finalizing writes on S3

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1054?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-1054:
-
Status: Closed  (was: Patch Available)

> Address performance issues with finalizing writes on S3
> ---
>
> Key: HUDI-1054
> URL: https://issues.apache.org/jira/browse/HUDI-1054
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: bootstrap, Common Core, Performance
>Reporter: Udit Mehrotra
>Assignee: Udit Mehrotra
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>
> I have identified 3 performance bottlenecks in the 
> [finalizeWrite|https://github.com/apache/hudi/blob/master/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java#L378]
>  function, which are manifesting and becoming more prominent with the new 
> bootstrap mechanism on S3:
>  * 
> [https://github.com/apache/hudi/blob/5e476733417c3f92ea97d3e5f9a5c8bc48246e99/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java#L425]
>   is a serial operation performed at the driver, and it can take a long time 
> when you have several partitions and a large number of files.
>  * The invalid data paths are stored in a List instead of a Set, and as a 
> result the following operation becomes N^2, taking significant time to compute 
> at the driver: 
> [https://github.com/apache/hudi/blob/5e476733417c3f92ea97d3e5f9a5c8bc48246e99/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java#L429]
>  * 
> [https://github.com/apache/hudi/blob/5e476733417c3f92ea97d3e5f9a5c8bc48246e99/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java#L473]
>  does a recursive delete of the marker directory at the driver. This is again 
> extremely expensive when you have a large number of partitions and files.
>  
> Upon testing with a 1 TB data set, having 8000 partitions and approximately 
> 19 files, this whole process consumes *35 minutes*. There is scope to 
> address these performance issues with Spark parallelization and by using 
> appropriate data structures.
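
A hedged sketch of the remedies implied above (Set membership instead of the N^2 list scan, and Spark-parallel deletes instead of serial driver-side deletes); collectInvalidPaths and parallelism are illustrative:

{code:java}
// O(1) membership checks at the driver.
Set<String> invalidDataPaths = new HashSet<>(collectInvalidPaths()); // hypothetical helper

// Delete on executors rather than serially at the driver.
jsc.parallelize(new ArrayList<>(invalidDataPaths), parallelism).foreach(path -> {
  // FileSystem handles are not serializable; obtain one per task.
  FileSystem fs = FSUtils.getFs(path, new Configuration());
  fs.delete(new Path(path), false);
});
{code}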



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-422) Cleanup bootstrap code and create write APIs for supporting bootstrap

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-422?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-422:

Status: Closed  (was: Patch Available)

> Cleanup bootstrap code and create write APIs for supporting bootstrap 
> --
>
> Key: HUDI-422
> URL: https://issues.apache.org/jira/browse/HUDI-422
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Writer Core
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 96h
>  Remaining Estimate: 0h
>
> Once refactor for HoodieWriteClient is done, we can cleanup and introduce 
> HoodieBootstrapClient as a separate PR.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-427) Implement CLI support for performing bootstrap

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-427?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-427:

Status: Closed  (was: Patch Available)

> Implement CLI support for performing bootstrap
> --
>
> Key: HUDI-427
> URL: https://issues.apache.org/jira/browse/HUDI-427
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: CLI
>Reporter: Balaji Varadarajan
>Assignee: Wenning Ding
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>  Time Spent: 168h
>  Remaining Estimate: 0h
>
> Need CLI to perform bootstrap as described in 
> [https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+%3A+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi]



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-424) Implement Hive Query Side Integration for querying tables containing bootstrap file slices

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-424?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-424:

Status: Closed  (was: Patch Available)

> Implement Hive Query Side Integration for querying tables containing 
> bootstrap file slices
> --
>
> Key: HUDI-424
> URL: https://issues.apache.org/jira/browse/HUDI-424
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Hive Integration
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 336h
>  Remaining Estimate: 0h
>
> Support for Hive read-optimized and realtime queries 
>  
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-806) Implement support for bootstrapping via Spark datasource API

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-806?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-806:

Status: Closed  (was: Patch Available)

> Implement support for bootstrapping via Spark datasource API
> 
>
> Key: HUDI-806
> URL: https://issues.apache.org/jira/browse/HUDI-806
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Spark Integration
>Reporter: Udit Mehrotra
>Assignee: Udit Mehrotra
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 336h
>  Remaining Estimate: 0h
>
> This Jira tracks the work required to perform bootstrapping through the Spark 
> data source API.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-425) Implement support for bootstrapping in HoodieDeltaStreamer

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-425?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-425:

Status: Closed  (was: Patch Available)

> Implement support for bootstrapping in HoodieDeltaStreamer
> --
>
> Key: HUDI-425
> URL: https://issues.apache.org/jira/browse/HUDI-425
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: DeltaStreamer
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
>  Labels: help-wanted
> Fix For: 0.6.0
>
>  Time Spent: 168h
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-418) Bootstrap Index - Implementation

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-418?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-418:

Status: Closed  (was: Patch Available)

> Bootstrap Index - Implementation
> 
>
> Key: HUDI-418
> URL: https://issues.apache.org/jira/browse/HUDI-418
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Common Core
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>  Time Spent: 10m
>  Remaining Estimate: 0h
>
> An implementation for 
> [https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+:+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi#RFC-12:EfficientMigrationofLargeParquetTablestoApacheHudi-BootstrapIndex:]
>  is present in 
> [https://github.com/bvaradar/hudi/blob/vb_bootstrap/hudi-common/src/main/java/org/apache/hudi/common/consolidated/CompositeMapFile.java]
>  
> We need to make it solid with unit-tests and cleanup. 
>  
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-420) Automated end to end Integration Test

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-420?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-420:

Status: Closed  (was: Patch Available)

> Automated end to end Integration Test
> -
>
> Key: HUDI-420
> URL: https://issues.apache.org/jira/browse/HUDI-420
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Testing
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 72h
>  Remaining Estimate: 0h
>
> We need the end-to-end test in ITTestHoodieDemo to also include bootstrap 
> table cases.
> We can have a new table bootstrapped from the Hoodie table built in the demo 
> and ensure queries work and return the same responses.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-426) Implement Spark DataSource Support for querying bootstrapped tables

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-426?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-426:

Status: Closed  (was: Patch Available)

> Implement Spark DataSource Support for querying bootstrapped tables
> ---
>
> Key: HUDI-426
> URL: https://issues.apache.org/jira/browse/HUDI-426
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Spark Integration
>Reporter: Balaji Varadarajan
>Assignee: Udit Mehrotra
>Priority: Blocker
>  Labels: pull-request-available
> Fix For: 0.6.0
>
>  Time Spent: 10m
>  Remaining Estimate: 0h
>
> We need the ability in the Spark DataSource to query a COW table which is 
> bootstrapped as per 
> [https://cwiki.apache.org/confluence/display/HUDI/RFC+-+12+:+Efficient+Migration+of+Large+Parquet+Tables+to+Apache+Hudi#RFC-12:EfficientMigrationofLargeParquetTablestoApacheHudi-BootstrapIndex:]
>  
> The current implementation delegates to the Parquet DataSource, but this won't 
> work as we need the ability to stitch the columns externally.
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-423) Implement upsert functionality for handling updates to these bootstrap file slices

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-423?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-423:

Status: Closed  (was: Patch Available)

> Implement upsert functionality for handling updates to these bootstrap file 
> slices
> --
>
> Key: HUDI-423
> URL: https://issues.apache.org/jira/browse/HUDI-423
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Common Core, Writer Core
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 168h
>  Remaining Estimate: 0h
>
> Needs support for handling upserts of these file slices. For MOR tables, 
> compaction support is also needed. 
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-421) Cleanup bootstrap code and create PR for FileSystemView changes

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-421?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-421:

Status: Closed  (was: Patch Available)

> Cleanup bootstrap code and create PR for FileSystemView changes
> -
>
> Key: HUDI-421
> URL: https://issues.apache.org/jira/browse/HUDI-421
> Project: Apache Hudi
>  Issue Type: Sub-task
>  Components: Common Core
>Reporter: Balaji Varadarajan
>Assignee: Balaji Varadarajan
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 240h
>  Remaining Estimate: 0h
>
> FileSystemView needs changes to identify and handle bootstrap file slices. 
> Code changes are present in 
> [https://github.com/bvaradar/hudi/tree/vb_bootstrap] and need cleanup before 
> they are ready to become a PR.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Updated] (HUDI-807) Spark DS Support for incremental queries for bootstrapped tables

2020-08-04 Thread Vinoth Chandar (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-807?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Vinoth Chandar updated HUDI-807:

Status: Closed  (was: Patch Available)

> Spark DS Support for incremental queries for bootstrapped tables
> 
>
> Key: HUDI-807
> URL: https://issues.apache.org/jira/browse/HUDI-807
> Project: Apache Hudi
>  Issue Type: Sub-task
>Reporter: Udit Mehrotra
>Assignee: Udit Mehrotra
>Priority: Blocker
> Fix For: 0.6.0
>
>  Time Spent: 120h
>  Remaining Estimate: 0h
>
> Investigate and figure out the changes required in Spark integration code to 
> make incremental queries work seamlessly for bootstrapped tables.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465458394



##
File path: hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala
##
@@ -258,11 +258,14 @@ object DataSourceWriteOptions {
     */
   val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = "hoodie.datasource.write.streaming.ignore.failed.batch"
   val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = "true"
+  val SYNC_CLIENT_TOOL_CLASS = "hoodie.sync.client.tool.class"
+  val DEFAULT_SYNC_CLIENT_TOOL_CLASS = "org.apache.hudi.hive.HiveSyncTool"

Review comment:
   done





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] zhedoubushishi commented on a change in pull request #1869: [HUDI-427] Implement CLI support for performing bootstrap

2020-08-04 Thread GitBox


zhedoubushishi commented on a change in pull request #1869:
URL: https://github.com/apache/hudi/pull/1869#discussion_r465457779



##
File path: 
hudi-cli/src/main/java/org/apache/hudi/cli/commands/BootstrapCommand.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.commands;
+
+import org.apache.hudi.avro.model.BootstrapIndexInfo;
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.HoodiePrintHelper;
+import org.apache.hudi.cli.TableHeader;
+import org.apache.hudi.cli.commands.SparkMain.SparkCommand;
+import org.apache.hudi.cli.utils.InputStreamConsumer;
+import org.apache.hudi.cli.utils.SparkUtil;
+import org.apache.hudi.common.model.BootstrapSourceFileMapping;
+import org.apache.hudi.common.model.HoodieFileGroupId;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.utilities.UtilHelpers;
+
+import org.apache.spark.launcher.SparkLauncher;
+import org.apache.spark.util.Utils;
+import org.springframework.shell.core.CommandMarker;
+import org.springframework.shell.core.annotation.CliCommand;
+import org.springframework.shell.core.annotation.CliOption;
+import org.springframework.stereotype.Component;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import scala.collection.JavaConverters;
+
+/**
+ * CLI command to perform bootstrap action & display bootstrap index.
+ */
+@Component
+public class BootstrapCommand implements CommandMarker {
+
+  @CliCommand(value = "bootstrap run", help = "Run a bootstrap action for current Hudi table")
+  public String bootstrap(

Review comment:
   I reconsidered this issue. I agree that we can run ```bootstrap``` the way 
```HDFSParquetImportCommand``` does, so we only need to connect to the table 
when we want to get the index info.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] zhedoubushishi commented on a change in pull request #1869: [HUDI-427] Implement CLI support for performing bootstrap

2020-08-04 Thread GitBox


zhedoubushishi commented on a change in pull request #1869:
URL: https://github.com/apache/hudi/pull/1869#discussion_r465457434



##
File path: 
hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestBootstrapCommand.java
##
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.cli.integ;
+
+import org.apache.hudi.cli.HoodieCLI;
+import org.apache.hudi.cli.HoodiePrintHelper;
+import org.apache.hudi.cli.commands.TableCommand;
+import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
+import org.apache.hudi.testutils.HoodieTestDataGenerator;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.shell.core.CommandResult;
+
+import java.io.File;
+import java.io.IOException;
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Test class of {@link org.apache.hudi.cli.commands.BootstrapCommand}.
+ */
+public class ITTestBootstrapCommand extends AbstractShellIntegrationTest {
+
+  private static final int TOTAL_RECORDS = 100;
+  private static final String PARTITION_FIELD = "datestr";
+  private static final String RECORD_KEY_FIELD = "_row_key";
+
+  private String sourcePath;
+  private String tablePath;
+  private List<String> partitions;
+
+  @BeforeEach
+  public void init() throws IOException {
+    String srcName = "source";
+    String tableName = "test-table";
+    sourcePath = basePath + File.separator + srcName;
+    tablePath = basePath + File.separator + tableName;
+
+    partitions = Arrays.asList("2018", "2019", "2020");
+    double timestamp = new Double(Instant.now().toEpochMilli()).longValue();
+    Dataset<Row> df = HoodieTestDataGenerator.generateTestRawTripDataset(timestamp,
+        TOTAL_RECORDS, partitions, jsc, sqlContext);
+    df.write().partitionBy("datestr").format("parquet").mode(SaveMode.Overwrite).save(sourcePath);
+
+    // Create table and connect
+    new TableCommand().createTable(
+        tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(),
+        "", TimelineLayoutVersion.VERSION_1, "org.apache.hudi.common.model.HoodieAvroPayload",
+        "org.apache.hudi.common.bootstrap.index.HFileBasedBootstrapIndex");
+  }
+
+  /**
+   * Test case for command 'bootstrap'.
+   */
+  @Test
+  public void testBootstrapRunCommand() throws IOException {
+    // test bootstrap run command
+    String cmdStr = String.format("bootstrap run --sourcePath %s --recordKeyColumns %s --partitionFields %s --sparkMaster %s",
+        sourcePath, RECORD_KEY_FIELD, PARTITION_FIELD, "local");
+    CommandResult cr = getShell().executeCommand(cmdStr);
+    assertTrue(cr.isSuccess());
+
+    // Check hudi table exists
+    new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 2000, 30, 7);
+    metaClient = HoodieCLI.getTableMetaClient();
+    assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 1 commit.");
+
+    // test bootstrap show indexed partitions

Review comment:
   Added another two test cases for ```bootstrap index showMapping```.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] FelixKJose commented on issue #1895: HUDI Dataset backed by Hive Metastore fails on Presto with Unknown converted type TIMESTAMP_MICROS

2020-08-04 Thread GitBox


FelixKJose commented on issue #1895:
URL: https://github.com/apache/hudi/issues/1895#issuecomment-668963672


   Could someone help me or provide some insights?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465450971



##
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
##
@@ -475,12 +480,38 @@ private String startCommit() {
     throw lastException;
   }
 
-  /**
-   * Sync to Hive.
-   */
-  public void syncHiveIfNeeded() {
+  private void syncMeta(HoodieDeltaStreamerMetrics metrics) {
+    String syncClientToolClass = cfg.syncClientToolClass;
+    // for backward compatibility
     if (cfg.enableHiveSync) {
-      syncHive();
+      cfg.enableMetaSync = true;
+      syncClientToolClass = String.format("%s,%s", cfg.syncClientToolClass, "org.apache.hudi.hive.HiveSyncTool");

Review comment:
   done
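
For readers of the diff: the comma-separated class list built above would be consumed roughly like this (a sketch; the exact reflection helper the PR uses may differ):

```java
for (String impl : syncClientToolClass.split(",")) {
  // Instantiate each configured sync tool by class name and run it.
  AbstractSyncTool syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(
      impl.trim(), new Class<?>[] {Properties.class, FileSystem.class}, props, fs);
  syncTool.syncHoodieTable();
}
```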





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465450351



##
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public abstract class AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
+  protected final HoodieTableMetaClient metaClient;
+  protected HoodieTimeline activeTimeline;
+  protected final HoodieTableType tableType;
+  protected final FileSystem fs;
+  private String basePath;
+  private boolean assumeDatePartitioning;
+
+  public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, FileSystem fs) {
+    this.metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
+    this.tableType = metaClient.getTableType();
+    this.basePath = basePath;
+    this.assumeDatePartitioning = assumeDatePartitioning;
+    this.fs = fs;
+    this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+  }
+
+  public abstract void createTable(String tableName, MessageType storageSchema,
+                                   String inputFormatClass, String outputFormatClass, String serdeClass);
+
+  public abstract boolean doesTableExist(String tableName);
+
+  public abstract Option<String> getLastCommitTimeSynced(String tableName);
+
+  public abstract void updateLastCommitTimeSynced(String tableName);
+
+  public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
+
+  public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
+
+  public abstract Map<String, String> getTableSchema(String tableName);
+
+  public HoodieTimeline getActiveTimeline() {
+    return activeTimeline;
+  }
+
+  public HoodieTableType getTableType() {
+    return tableType;
+  }
+
+  public String getBasePath() {
+    return metaClient.getBasePath();
+  }
+
+  public FileSystem getFs() {
+    return fs;
+  }
+
+  public void closeQuietly(ResultSet resultSet, Statement stmt) {
+    try {
+      if (stmt != null) {
+        stmt.close();
+      }
+    } catch (SQLException e) {
+      LOG.error("Could not close the statement opened ", e);
+    }
+
+    try {
+      if (resultSet != null) {
+        resultSet.close();
+      }
+    } catch (SQLException e) {
+      LOG.error("Could not close the resultset opened ", e);

Review comment:
   done





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465449363



##
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public abstract class AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
+  protected final HoodieTableMetaClient metaClient;
+  protected HoodieTimeline activeTimeline;
+  protected final HoodieTableType tableType;
+  protected final FileSystem fs;
+  private String basePath;
+  private boolean assumeDatePartitioning;
+
+  public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, FileSystem fs) {
+    this.metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
+    this.tableType = metaClient.getTableType();
+    this.basePath = basePath;
+    this.assumeDatePartitioning = assumeDatePartitioning;
+    this.fs = fs;
+    this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+  }
+
+  public abstract void createTable(String tableName, MessageType storageSchema,
+                                   String inputFormatClass, String outputFormatClass, String serdeClass);
+
+  public abstract boolean doesTableExist(String tableName);
+
+  public abstract Option<String> getLastCommitTimeSynced(String tableName);
+
+  public abstract void updateLastCommitTimeSynced(String tableName);
+
+  public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
+
+  public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
+
+  public abstract Map<String, String> getTableSchema(String tableName);
+
+  public HoodieTimeline getActiveTimeline() {
+    return activeTimeline;
+  }
+
+  public HoodieTableType getTableType() {
+    return tableType;
+  }
+
+  public String getBasePath() {
+    return metaClient.getBasePath();
+  }
+
+  public FileSystem getFs() {
+    return fs;
+  }
+
+  public void closeQuietly(ResultSet resultSet, Statement stmt) {
+    try {
+      if (stmt != null) {
+        stmt.close();
+      }
+    } catch (SQLException e) {
+      LOG.error("Could not close the statement opened ", e);

Review comment:
   done
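
   For context, a minimal sketch of a concrete client built on this abstraction
   (hypothetical, not from the PR; it assumes hudi-sync-common on the classpath
   and a valid Hudi base path, and every method body is a stub):

   ```java
   import java.util.Collections;
   import java.util.List;
   import java.util.Map;

   import org.apache.hadoop.fs.FileSystem;
   import org.apache.hudi.common.util.Option;
   import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
   import org.apache.parquet.schema.MessageType;

   // Hypothetical no-op client: shows the operations a concrete catalog sync
   // client (Hive, DLA, ...) must provide on top of the base class.
   public class NoOpSyncClient extends AbstractSyncHoodieClient {

     public NoOpSyncClient(String basePath, FileSystem fs) {
       super(basePath, false, fs); // base class resolves table type and timeline
     }

     @Override
     public void createTable(String tableName, MessageType storageSchema,
                             String inputFormatClass, String outputFormatClass, String serdeClass) {
       // a real client would issue CREATE TABLE DDL against its catalog here
     }

     @Override
     public boolean doesTableExist(String tableName) {
       return false;
     }

     @Override
     public Option<String> getLastCommitTimeSynced(String tableName) {
       return Option.empty(); // a real client reads this from table properties
     }

     @Override
     public void updateLastCommitTimeSynced(String tableName) {
       // a real client persists the latest synced instant here
     }

     @Override
     public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
       // a real client registers the new partitions here
     }

     @Override
     public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
       // a real client refreshes the changed partition locations here
     }

     @Override
     public Map<String, String> getTableSchema(String tableName) {
       return Collections.emptyMap();
     }
   }
   ```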





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465449064



##
File path: 
hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java
##
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.dla;
+
+import com.beust.jcommander.JCommander;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.dla.util.Utils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.InvalidTableException;
+import org.apache.hudi.hadoop.HoodieParquetInputFormat;
+import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
+import org.apache.hudi.hive.SchemaDifference;
+import org.apache.hudi.hive.util.HiveSchemaUtil;
+import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.stream.Collectors;
+
+/**
+ * Tool to sync a hoodie table with a dla table. Either use it as an API,
+ * DLASyncTool.syncHoodieTable(DLASyncConfig), or as a command line: java -cp hoodie-hive.jar DLASyncTool [args]
+ * <p>
+ * This utility will get the schema from the latest commit and will sync the dla table schema. Also this will sync the
+ * partitions incrementally (all the partitions modified since the last commit).
+ */
+@SuppressWarnings("WeakerAccess")
+public class DLASyncTool extends AbstractSyncTool {
+
+  private static final Logger LOG = LogManager.getLogger(DLASyncTool.class);
+  public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
+  public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
+
+  private final DLASyncConfig cfg;
+  private final HoodieDLAClient hoodieDLAClient;
+  private final String snapshotTableName;
+  private final Option<String> roTableTableName;
+
+  public DLASyncTool(Properties properties, FileSystem fs) {
+    super(properties, fs);
+    this.hoodieDLAClient = new HoodieDLAClient(Utils.propertiesToConfig(properties), fs);
+    this.cfg = Utils.propertiesToConfig(properties);
+    switch (hoodieDLAClient.getTableType()) {
+      case COPY_ON_WRITE:
+        this.snapshotTableName = cfg.tableName;
+        this.roTableTableName = Option.empty();
+        break;
+      case MERGE_ON_READ:
+        this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
+        this.roTableTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) :
+            Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
+        break;
+      default:
+        LOG.error("Unknown table type " + hoodieDLAClient.getTableType());
+        throw new InvalidTableException(hoodieDLAClient.getBasePath());
+    }
+  }
+
+  @Override
+  public void syncHoodieTable() {
+    try {
+      switch (hoodieDLAClient.getTableType()) {
+        case COPY_ON_WRITE:
+          syncHoodieTable(snapshotTableName, false);
+          break;
+        case MERGE_ON_READ:
+          // sync a RO table for MOR
+          syncHoodieTable(roTableTableName.get(), false);
+          // sync a RT table for MOR
+          syncHoodieTable(snapshotTableName, true);
+          break;
+        default:
+          LOG.error("Unknown table type " + hoodieDLAClient.getTableType());
+          throw new InvalidTableException(hoodieDLAClient.getBasePath());
+      }
+    } catch (RuntimeException re) {
+      LOG.error("Got runtime exception when dla syncing", re);
+    } finally {
+      hoodieDLAClient.close();
+    }
+  }
+
+  private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat) {
+    LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieDLAClient.getBasePath()
+        + " of type " + hoodieDLAClient.getTableType());
+    // Check if the 

[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465449225



##
File path: 
hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java
##
@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.dla;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.hive.HoodieHiveSyncException;
+import org.apache.hudi.hive.PartitionValueExtractor;
+import org.apache.hudi.hive.SchemaDifference;
+import org.apache.hudi.hive.util.HiveSchemaUtil;
+import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class HoodieDLAClient extends AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(HoodieDLAClient.class);
+  private static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
+  // Make sure we have the dla JDBC driver in classpath
+  private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
+  private static final String DLA_ESCAPE_CHARACTER = "";
+  private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";
+
+  static {
+    try {
+      Class.forName(DRIVER_NAME);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException("Could not find " + DRIVER_NAME + " in classpath. ", e);
+    }
+  }
+
+  private Connection connection;
+  private DLASyncConfig dlaConfig;
+  private PartitionValueExtractor partitionValueExtractor;
+
+  public HoodieDLAClient(DLASyncConfig syncConfig, FileSystem fs) {
+    super(syncConfig.basePath, syncConfig.assumeDatePartitioning, fs);
+    this.dlaConfig = syncConfig;
+    try {
+      this.partitionValueExtractor =
+          (PartitionValueExtractor) Class.forName(dlaConfig.partitionValueExtractorClass).newInstance();
+    } catch (Exception e) {
+      throw new HoodieException(
+          "Failed to initialize PartitionValueExtractor class " + dlaConfig.partitionValueExtractorClass, e);
+    }
+    createDLAConnection();
+  }
+
+  private void createDLAConnection() {
+    if (connection == null) {
+      try {
+        Class.forName(DRIVER_NAME);
+      } catch (ClassNotFoundException e) {
+        LOG.error("Unable to load DLA driver class", e);
+        return;
+      }
+      try {
+        this.connection = DriverManager.getConnection(dlaConfig.jdbcUrl, dlaConfig.dlaUser, dlaConfig.dlaPass);
+        LOG.info("Successfully established DLA connection to  " + dlaConfig.jdbcUrl);
+      } catch (SQLException e) {
+        throw new HoodieException("Cannot create dla connection ", e);
+      }
+    }
+  }
+
+  @Override
+  public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass) {
+    try {
+      String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema, toHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass);
+      LOG.info("Creating table with " + createSQLQuery);
+      updateDLASQL(createSQLQuery);
+    } catch (IOException e) {
+      throw new HoodieException("Failed to create table " + tableName, e);
+    }
+  }
+
+  public Map<String, String> getTableSchema(String tableName) {
+    if (!doesTableExist(tableName)) {
+      throw new IllegalArgumentException(
+          "Failed to get schema for table " + tableName + " does not exist");
+    }
+    Map<String, String> 

[GitHub] [hudi] yanghua commented on pull request #1115: [HUDI-392] Introduce DIstributedTestDataSource to generate test data

2020-08-04 Thread GitBox


yanghua commented on pull request #1115:
URL: https://github.com/apache/hudi/pull/1115#issuecomment-668957475


   > @yanghua Is it okay to close this now ?
   
   Yes, closing...



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] yanghua closed pull request #1115: [HUDI-392] Introduce DIstributedTestDataSource to generate test data

2020-08-04 Thread GitBox


yanghua closed pull request #1115:
URL: https://github.com/apache/hudi/pull/1115


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465448424



##
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##
@@ -261,6 +268,44 @@ private[hudi] object HoodieSparkSqlWriter {
     hiveSyncConfig
   }
 
+  private def metaSync(parameters: Map[String, String],
+                       basePath: Path,
+                       hadoopConf: Configuration): Boolean = {
+    val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
+    var metaSyncEnabled = parameters.get(HUDI_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
+    var syncClientToolClass = parameters(SYNC_CLIENT_TOOL_CLASS)
+    // for backward compatibility
+    if (hiveSyncEnabled) {
+      metaSyncEnabled = true
+      syncClientToolClass = String.format("%s,%s", syncClientToolClass, "org.apache.hudi.hive.HiveSyncTool")

Review comment:
   done





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] bigmisspanda commented on issue #1613: [SUPPORT] Does hudi support change column for spark dataframe OR alter hudi table add a column?

2020-08-04 Thread GitBox


bigmisspanda commented on issue #1613:
URL: https://github.com/apache/hudi/issues/1613#issuecomment-668953188


   After I add new columns to a Spark DataFrame and write to Hudi, I cannot query the newly added columns. Why is that?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] bvaradar commented on pull request #1752: [HUDI-575] Support Async Compaction for spark streaming writes to hudi table

2020-08-04 Thread GitBox


bvaradar commented on pull request #1752:
URL: https://github.com/apache/hudi/pull/1752#issuecomment-668947071


   For the remaining two questions, here are the answers:
   
   Q: What if the user sets the writeClient config for `inline` = false and does not set the async compaction datasource option? Should we control this at a single level?
   A: After discussion, we decided that async compaction will be enabled by default for MOR tables. If async compaction is disabled by config, inline compaction will be enabled automatically.
   
   Q: Basic question: if there are no writes, no compaction gets scheduled, right? So async compaction is a no-op, i.e. it will check if there is some work to do and, if not, won't trigger anything?
   A: Yes, that is correct.
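
   A minimal write-config sketch of the compaction toggle mentioned above (the
   `HoodieCompactionConfig` builder and the `hoodie.compact.inline` key are from
   hudi-client; the base path is hypothetical, and the async/inline fallback
   wiring itself lives in the PR, not in this snippet):

   ```java
   import org.apache.hudi.config.HoodieCompactionConfig;
   import org.apache.hudi.config.HoodieWriteConfig;

   public class CompactionToggleExample {
     public static void main(String[] args) {
       // Illustrative only: when async compaction is turned off for a MOR table,
       // inline compaction is expected to be enabled automatically (per the
       // discussion above). Here the inline flag is set explicitly.
       HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
           .withPath("/tmp/hudi_mor_table") // hypothetical base path
           .withCompactionConfig(HoodieCompactionConfig.newBuilder()
               .withInlineCompaction(true)  // the fallback when async is disabled
               .build())
           .build();
       System.out.println(writeConfig.getProps().getProperty("hoodie.compact.inline")); // "true"
     }
   }
   ```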
   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465434113



##
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/HoodieDeltaStreamer.java
##
@@ -267,9 +267,16 @@ public Operation convert(String value) throws ParameterException {
         description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert")
     public Boolean filterDupes = false;
 
+    // will be abandoned in a future version; --enable-sync is recommended instead
     @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
     public Boolean enableHiveSync = false;
 
+    @Parameter(names = {"--enable-sync"}, description = "Enable syncing meta")
+    public Boolean enableMetaSync = false;
+
+    @Parameter(names = {"--sync-tool-classes"}, description = "Meta sync client tool, using comma to separate multi tools")
+    public String syncClientToolClass = "org.apache.hudi.hive.HiveSyncTool";

Review comment:
   done
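
   For intuition, a rough sketch (assumed, not the PR's code) of how such a
   comma-separated class list can be parsed and loaded via reflection:

   ```java
   import java.util.Arrays;
   import java.util.List;
   import java.util.stream.Collectors;

   public class SyncToolClassParser {
     // Hypothetical helper: split the --sync-tool-classes value and load each class.
     public static List<Class<?>> parse(String syncClientToolClasses) {
       return Arrays.stream(syncClientToolClasses.split(","))
           .map(String::trim)
           .filter(name -> !name.isEmpty())
           .map(name -> {
             try {
               return Class.forName(name);
             } catch (ClassNotFoundException e) {
               throw new IllegalArgumentException("Unknown sync tool class: " + name, e);
             }
           })
           .collect(Collectors.toList());
     }

     public static void main(String[] args) {
       // JDK classes are used so the sketch runs standalone; in practice the
       // values would be tool classes such as org.apache.hudi.hive.HiveSyncTool.
       parse("java.lang.String, java.util.ArrayList")
           .forEach(c -> System.out.println(c.getName()));
     }
   }
   ```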





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] lw309637554 commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


lw309637554 commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465433316



##
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##
@@ -261,6 +268,44 @@ private[hudi] object HoodieSparkSqlWriter {
     hiveSyncConfig
   }
 
+  private def metaSync(parameters: Map[String, String],
+                       basePath: Path,
+                       hadoopConf: Configuration): Boolean = {
+    val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
+    var metaSyncEnabled = parameters.get(HUDI_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)

Review comment:
   done





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] mingujotemp commented on issue #1910: [SUPPORT] Upsert operation duplicating records in a partition

2020-08-04 Thread GitBox


mingujotemp commented on issue #1910:
URL: https://github.com/apache/hudi/issues/1910#issuecomment-668936259


   @bvaradar I'm reading the table with `spark.sql` through the Hive metastore, not directly reading parquet using `spark.read.format("hudi")`.



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] mingujotemp commented on issue #1909: [SUPPORT] "Failed to get update last commit time synced to 20200804071144"

2020-08-04 Thread GitBox


mingujotemp commented on issue #1909:
URL: https://github.com/apache/hudi/issues/1909#issuecomment-668934461


   @bvaradar Could you elaborate more? Which part of hive-conf are you describing? Is it hive-conf.xml on EMR, or the Hive configuration for Hudi?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] shenh062326 commented on a change in pull request #1868: [HUDI-1083] Optimization in determining insert bucket location for a given key

2020-08-04 Thread GitBox


shenh062326 commented on a change in pull request #1868:
URL: https://github.com/apache/hudi/pull/1868#discussion_r465428805



##
File path: 
hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java
##
@@ -252,8 +250,27 @@ public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
     assertEquals(BucketType.INSERT, partitioner.getBucketInfo(2).bucketType,
         "Bucket 2 is INSERT");
     assertEquals(3, insertBuckets.size(), "Total of 3 insert buckets");
-    assertEquals(0, insertBuckets.get(0).bucketNumber, "First insert bucket must be same as update bucket");
-    assertEquals(0.5, insertBuckets.get(0).weight, 0.01, "First insert bucket should have weight 0.5");
+
+    assertEquals(0, insertBuckets.get(0).getKey().bucketNumber,
+        "First insert bucket must be same as update bucket");

Review comment:
   Got it.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] shenh062326 commented on a change in pull request #1868: [HUDI-1083] Optimization in determining insert bucket location for a given key

2020-08-04 Thread GitBox


shenh062326 commented on a change in pull request #1868:
URL: https://github.com/apache/hudi/pull/1868#discussion_r465428805



##
File path: 
hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java
##
@@ -252,8 +250,27 @@ public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
     assertEquals(BucketType.INSERT, partitioner.getBucketInfo(2).bucketType,
         "Bucket 2 is INSERT");
     assertEquals(3, insertBuckets.size(), "Total of 3 insert buckets");
-    assertEquals(0, insertBuckets.get(0).bucketNumber, "First insert bucket must be same as update bucket");
-    assertEquals(0.5, insertBuckets.get(0).weight, 0.01, "First insert bucket should have weight 0.5");
+
+    assertEquals(0, insertBuckets.get(0).getKey().bucketNumber,
+        "First insert bucket must be same as update bucket");

Review comment:
   Got it.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] shenh062326 commented on pull request #1819: [HUDI-1058] Make delete marker configurable

2020-08-04 Thread GitBox


shenh062326 commented on pull request #1819:
URL: https://github.com/apache/hudi/pull/1819#issuecomment-668930304


   Let me confirm the new implementation again: add a transient field to OverwriteWithLatestAvroPayload, like below:
   ```
     private transient String deleteMarkerField = null;

     public void setDeleteMarkerField(String deleteMarkerField) {
       this.deleteMarkerField = deleteMarkerField;
     }
   ```
   
   And set the deleteMarkerField before HoodieMergeHandle.write calls OverwriteWithLatestAvroPayload.combineAndGetUpdateValue, right?
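
   A runnable sketch of the idea (the field name, schema, and helper below are
   assumptions for illustration; the real logic would live in
   combineAndGetUpdateValue):

   ```java
   import org.apache.avro.Schema;
   import org.apache.avro.SchemaBuilder;
   import org.apache.avro.generic.GenericData;
   import org.apache.avro.generic.GenericRecord;

   public class DeleteMarkerSketch {

     // Hypothetical: true when the configured marker column flags a delete.
     static boolean isDeleteRecord(GenericRecord record, String deleteMarkerField) {
       if (deleteMarkerField == null || record.getSchema().getField(deleteMarkerField) == null) {
         return false; // no marker configured, or column absent: normal upsert
       }
       Object marker = record.get(deleteMarkerField);
       return marker != null && Boolean.parseBoolean(marker.toString());
     }

     public static void main(String[] args) {
       // Illustrative schema with a boolean delete-marker column.
       Schema schema = SchemaBuilder.record("rec").fields()
           .requiredString("key")
           .requiredBoolean("is_deleted")
           .endRecord();
       GenericRecord rec = new GenericData.Record(schema);
       rec.put("key", "k1");
       rec.put("is_deleted", true);
       System.out.println(isDeleteRecord(rec, "is_deleted")); // true -> drop record
     }
   }
   ```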
   
   
   
   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[jira] [Commented] (HUDI-1112) Blog on Tracking Hudi Data along transaction time and buisness time

2020-08-04 Thread leesf (Jira)


[ 
https://issues.apache.org/jira/browse/HUDI-1112?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17171214#comment-17171214
 ] 

leesf commented on HUDI-1112:
-

[~nandini57] Assigned to you.

> Blog on Tracking Hudi Data along transaction time and buisness time
> ---
>
> Key: HUDI-1112
> URL: https://issues.apache.org/jira/browse/HUDI-1112
> Project: Apache Hudi
>  Issue Type: Task
>  Components: Docs
>Reporter: Vinoth Chandar
>Assignee: Sandeep Maji
>Priority: Major
> Fix For: 0.6.0
>
>
> https://github.com/apache/hudi/issues/1705



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[jira] [Assigned] (HUDI-1112) Blog on Tracking Hudi Data along transaction time and buisness time

2020-08-04 Thread leesf (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1112?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

leesf reassigned HUDI-1112:
---

Assignee: Sandeep Maji

> Blog on Tracking Hudi Data along transaction time and buisness time
> ---
>
> Key: HUDI-1112
> URL: https://issues.apache.org/jira/browse/HUDI-1112
> Project: Apache Hudi
>  Issue Type: Task
>  Components: Docs
>Reporter: Vinoth Chandar
>Assignee: Sandeep Maji
>Priority: Major
> Fix For: 0.6.0
>
>
> https://github.com/apache/hudi/issues/1705



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465418437



##
File path: hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##
@@ -261,6 +268,44 @@ private[hudi] object HoodieSparkSqlWriter {
     hiveSyncConfig
   }
 
+  private def metaSync(parameters: Map[String, String],
+                       basePath: Path,
+                       hadoopConf: Configuration): Boolean = {
+    val hiveSyncEnabled = parameters.get(HIVE_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
+    var metaSyncEnabled = parameters.get(HUDI_SYNC_ENABLED_OPT_KEY).exists(r => r.toBoolean)
+    var syncClientToolClass = parameters(SYNC_CLIENT_TOOL_CLASS)
+    // for backward compatibility
+    if (hiveSyncEnabled) {
+      metaSyncEnabled = true
+      syncClientToolClass = String.format("%s,%s", syncClientToolClass, "org.apache.hudi.hive.HiveSyncTool")

Review comment:
   ditto





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465418366



##
File path: hudi-spark/src/main/scala/org/apache/hudi/DataSourceOptions.scala
##
@@ -258,11 +258,14 @@ object DataSourceWriteOptions {
     */
   val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = "hoodie.datasource.write.streaming.ignore.failed.batch"
   val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = "true"
+  val SYNC_CLIENT_TOOL_CLASS = "hoodie.sync.client.tool.class"
+  val DEFAULT_SYNC_CLIENT_TOOL_CLASS = "org.apache.hudi.hive.HiveSyncTool"

Review comment:
   use HiveSyncTool.class.getName?
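
   The suggestion amounts to deriving the constant from a class literal rather
   than a string; a tiny illustration of the pattern (a JDK class is used so it
   runs anywhere):

   ```java
   public class ClassNameConstantExample {
     // Same pattern as the suggested HiveSyncTool.class.getName(): the constant
     // tracks renames and moves at compile time instead of silently going stale.
     public static final String DEFAULT_LIST_CLASS = java.util.ArrayList.class.getName();

     public static void main(String[] args) {
       System.out.println(DEFAULT_LIST_CLASS); // prints java.util.ArrayList
     }
   }
   ```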





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465417539



##
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java
##
@@ -475,12 +480,38 @@ private String startCommit() {
     throw lastException;
   }
 
-  /**
-   * Sync to Hive.
-   */
-  public void syncHiveIfNeeded() {
+  private void syncMeta(HoodieDeltaStreamerMetrics metrics) {
+    String syncClientToolClass = cfg.syncClientToolClass;
+    // for backward compatibility
     if (cfg.enableHiveSync) {
-      syncHive();
+      cfg.enableMetaSync = true;
+      syncClientToolClass = String.format("%s,%s", cfg.syncClientToolClass, "org.apache.hudi.hive.HiveSyncTool");

Review comment:
   use HiveSyncTool.class.getName here?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465417053



##
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public abstract class AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
+  protected final HoodieTableMetaClient metaClient;
+  protected HoodieTimeline activeTimeline;
+  protected final HoodieTableType tableType;
+  protected final FileSystem fs;
+  private String basePath;
+  private boolean assumeDatePartitioning;
+
+  public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, FileSystem fs) {
+    this.metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
+    this.tableType = metaClient.getTableType();
+    this.basePath = basePath;
+    this.assumeDatePartitioning = assumeDatePartitioning;
+    this.fs = fs;
+    this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+  }
+
+  public abstract void createTable(String tableName, MessageType storageSchema,
+                                   String inputFormatClass, String outputFormatClass, String serdeClass);
+
+  public abstract boolean doesTableExist(String tableName);
+
+  public abstract Option<String> getLastCommitTimeSynced(String tableName);
+
+  public abstract void updateLastCommitTimeSynced(String tableName);
+
+  public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
+
+  public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
+
+  public abstract Map<String, String> getTableSchema(String tableName);
+
+  public HoodieTimeline getActiveTimeline() {
+    return activeTimeline;
+  }
+
+  public HoodieTableType getTableType() {
+    return tableType;
+  }
+
+  public String getBasePath() {
+    return metaClient.getBasePath();
+  }
+
+  public FileSystem getFs() {
+    return fs;
+  }
+
+  public void closeQuietly(ResultSet resultSet, Statement stmt) {
+    try {
+      if (stmt != null) {
+        stmt.close();
+      }
+    } catch (SQLException e) {
+      LOG.error("Could not close the statement opened ", e);

Review comment:
   Let's change this to warn.
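
   A possible revision along those lines (a sketch, not the PR's final code) that
   logs at WARN and also closes the ResultSet, which the quoted version never
   touches:

   ```java
   import java.sql.ResultSet;
   import java.sql.SQLException;
   import java.sql.Statement;

   import org.apache.log4j.LogManager;
   import org.apache.log4j.Logger;

   public class JdbcCloseHelper {
     private static final Logger LOG = LogManager.getLogger(JdbcCloseHelper.class);

     // Hypothetical revision: close both resources independently, log at WARN.
     public static void closeQuietly(ResultSet resultSet, Statement stmt) {
       try {
         if (resultSet != null) {
           resultSet.close();
         }
       } catch (SQLException e) {
         LOG.warn("Could not close the resultset opened ", e);
       }
       try {
         if (stmt != null) {
           stmt.close();
         }
       } catch (SQLException e) {
         LOG.warn("Could not close the statement opened ", e);
       }
     }
   }
   ```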





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465417053



##
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sync.common;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public abstract class AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(AbstractSyncHoodieClient.class);
+  protected final HoodieTableMetaClient metaClient;
+  protected HoodieTimeline activeTimeline;
+  protected final HoodieTableType tableType;
+  protected final FileSystem fs;
+  private String basePath;
+  private boolean assumeDatePartitioning;
+
+  public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, FileSystem fs) {
+    this.metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
+    this.tableType = metaClient.getTableType();
+    this.basePath = basePath;
+    this.assumeDatePartitioning = assumeDatePartitioning;
+    this.fs = fs;
+    this.activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
+  }
+
+  public abstract void createTable(String tableName, MessageType storageSchema,
+                                   String inputFormatClass, String outputFormatClass, String serdeClass);
+
+  public abstract boolean doesTableExist(String tableName);
+
+  public abstract Option<String> getLastCommitTimeSynced(String tableName);
+
+  public abstract void updateLastCommitTimeSynced(String tableName);
+
+  public abstract void addPartitionsToTable(String tableName, List<String> partitionsToAdd);
+
+  public abstract void updatePartitionsToTable(String tableName, List<String> changedPartitions);
+
+  public abstract Map<String, String> getTableSchema(String tableName);
+
+  public HoodieTimeline getActiveTimeline() {
+    return activeTimeline;
+  }
+
+  public HoodieTableType getTableType() {
+    return tableType;
+  }
+
+  public String getBasePath() {
+    return metaClient.getBasePath();
+  }
+
+  public FileSystem getFs() {
+    return fs;
+  }
+
+  public void closeQuietly(ResultSet resultSet, Statement stmt) {
+    try {
+      if (stmt != null) {
+        stmt.close();
+      }
+    } catch (SQLException e) {
+      LOG.error("Could not close the statement opened ", e);

Review comment:
   please change to warn

##
File path: 
hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/AbstractSyncHoodieClient.java
##
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language 

[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465416217



##
File path: 
hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/HoodieDLAClient.java
##
@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.dla;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.hive.HiveSyncConfig;
+import org.apache.hudi.hive.HoodieHiveSyncException;
+import org.apache.hudi.hive.PartitionValueExtractor;
+import org.apache.hudi.hive.SchemaDifference;
+import org.apache.hudi.hive.util.HiveSchemaUtil;
+import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+public class HoodieDLAClient extends AbstractSyncHoodieClient {
+  private static final Logger LOG = LogManager.getLogger(HoodieDLAClient.class);
+  private static final String HOODIE_LAST_COMMIT_TIME_SYNC = "hoodie_last_sync";
+  // Make sure we have the dla JDBC driver in classpath
+  private static final String DRIVER_NAME = "com.mysql.jdbc.Driver";
+  private static final String DLA_ESCAPE_CHARACTER = "";
+  private static final String TBL_PROPERTIES_STR = "TBLPROPERTIES";
+
+  static {
+    try {
+      Class.forName(DRIVER_NAME);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException("Could not find " + DRIVER_NAME + " in classpath. ", e);
+    }
+  }
+
+  private Connection connection;
+  private DLASyncConfig dlaConfig;
+  private PartitionValueExtractor partitionValueExtractor;
+
+  public HoodieDLAClient(DLASyncConfig syncConfig, FileSystem fs) {
+    super(syncConfig.basePath, syncConfig.assumeDatePartitioning, fs);
+    this.dlaConfig = syncConfig;
+    try {
+      this.partitionValueExtractor =
+          (PartitionValueExtractor) Class.forName(dlaConfig.partitionValueExtractorClass).newInstance();
+    } catch (Exception e) {
+      throw new HoodieException(
+          "Failed to initialize PartitionValueExtractor class " + dlaConfig.partitionValueExtractorClass, e);
+    }
+    createDLAConnection();
+  }
+
+  private void createDLAConnection() {
+    if (connection == null) {
+      try {
+        Class.forName(DRIVER_NAME);
+      } catch (ClassNotFoundException e) {
+        LOG.error("Unable to load DLA driver class", e);
+        return;
+      }
+      try {
+        this.connection = DriverManager.getConnection(dlaConfig.jdbcUrl, dlaConfig.dlaUser, dlaConfig.dlaPass);
+        LOG.info("Successfully established DLA connection to  " + dlaConfig.jdbcUrl);
+      } catch (SQLException e) {
+        throw new HoodieException("Cannot create dla connection ", e);
+      }
+    }
+  }
+
+  @Override
+  public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass) {
+    try {
+      String createSQLQuery = HiveSchemaUtil.generateCreateDDL(tableName, storageSchema, toHiveSyncConfig(), inputFormatClass, outputFormatClass, serdeClass);
+      LOG.info("Creating table with " + createSQLQuery);
+      updateDLASQL(createSQLQuery);
+    } catch (IOException e) {
+      throw new HoodieException("Failed to create table " + tableName, e);
+    }
+  }
+
+  public Map<String, String> getTableSchema(String tableName) {
+    if (!doesTableExist(tableName)) {
+      throw new IllegalArgumentException(
+          "Failed to get schema for table " + tableName + " does not exist");
+    }
+    Map<String, String> schema = 

[GitHub] [hudi] leesf commented on a change in pull request #1810: [HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync

2020-08-04 Thread GitBox


leesf commented on a change in pull request #1810:
URL: https://github.com/apache/hudi/pull/1810#discussion_r465415868



##
File path: 
hudi-sync/hudi-dla-sync/src/main/java/org/apache/hudi/dla/DLASyncTool.java
##
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.dla;
+
+import com.beust.jcommander.JCommander;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.dla.util.Utils;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.InvalidTableException;
+import org.apache.hudi.hadoop.HoodieParquetInputFormat;
+import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
+import org.apache.hudi.hive.SchemaDifference;
+import org.apache.hudi.hive.util.HiveSchemaUtil;
+import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
+import org.apache.hudi.sync.common.AbstractSyncTool;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.parquet.schema.MessageType;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.stream.Collectors;
+
+/**
+ * Tool to sync a hoodie table with a dla table. Either use it as an API,
+ * DLASyncTool.syncHoodieTable(DLASyncConfig), or as a command line: java -cp hoodie-hive.jar DLASyncTool [args]
+ * <p>
+ * This utility will get the schema from the latest commit and will sync the dla table schema. Also this will sync the
+ * partitions incrementally (all the partitions modified since the last commit).
+ */
+@SuppressWarnings("WeakerAccess")
+public class DLASyncTool extends AbstractSyncTool {
+
+  private static final Logger LOG = LogManager.getLogger(DLASyncTool.class);
+  public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
+  public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
+
+  private final DLASyncConfig cfg;
+  private final HoodieDLAClient hoodieDLAClient;
+  private final String snapshotTableName;
+  private final Option<String> roTableTableName;
+
+  public DLASyncTool(Properties properties, FileSystem fs) {
+    super(properties, fs);
+    this.hoodieDLAClient = new HoodieDLAClient(Utils.propertiesToConfig(properties), fs);
+    this.cfg = Utils.propertiesToConfig(properties);
+    switch (hoodieDLAClient.getTableType()) {
+      case COPY_ON_WRITE:
+        this.snapshotTableName = cfg.tableName;
+        this.roTableTableName = Option.empty();
+        break;
+      case MERGE_ON_READ:
+        this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
+        this.roTableTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) :
+            Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
+        break;
+      default:
+        LOG.error("Unknown table type " + hoodieDLAClient.getTableType());
+        throw new InvalidTableException(hoodieDLAClient.getBasePath());
+    }
+  }
+
+  @Override
+  public void syncHoodieTable() {
+    try {
+      switch (hoodieDLAClient.getTableType()) {
+        case COPY_ON_WRITE:
+          syncHoodieTable(snapshotTableName, false);
+          break;
+        case MERGE_ON_READ:
+          // sync a RO table for MOR
+          syncHoodieTable(roTableTableName.get(), false);
+          // sync a RT table for MOR
+          syncHoodieTable(snapshotTableName, true);
+          break;
+        default:
+          LOG.error("Unknown table type " + hoodieDLAClient.getTableType());
+          throw new InvalidTableException(hoodieDLAClient.getBasePath());
+      }
+    } catch (RuntimeException re) {
+      LOG.error("Got runtime exception when dla syncing", re);
+    } finally {
+      hoodieDLAClient.close();
+    }
+  }
+
+  private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat) {
+    LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieDLAClient.getBasePath()
+        + " of type " + hoodieDLAClient.getTableType());
+    // Check if the 

[jira] [Commented] (HUDI-1112) Blog on Tracking Hudi Data along transaction time and buisness time

2020-08-04 Thread Sandeep Maji (Jira)


[ 
https://issues.apache.org/jira/browse/HUDI-1112?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17171204#comment-17171204
 ] 

Sandeep Maji commented on HUDI-1112:


Not able to assign it to myself. Please help.

> Blog on Tracking Hudi Data along transaction time and buisness time
> ---
>
> Key: HUDI-1112
> URL: https://issues.apache.org/jira/browse/HUDI-1112
> Project: Apache Hudi
>  Issue Type: Task
>  Components: Docs
>Reporter: Vinoth Chandar
>Priority: Major
> Fix For: 0.6.0
>
>
> https://github.com/apache/hudi/issues/1705



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] nandini57 commented on issue #1705: Tracking Hudi Data along transaction time and buisness time

2020-08-04 Thread GitBox


nandini57 commented on issue #1705:
URL: https://github.com/apache/hudi/issues/1705#issuecomment-668908683


   Sure thankyou



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] n3nash commented on pull request #1115: [HUDI-392] Introduce DIstributedTestDataSource to generate test data

2020-08-04 Thread GitBox


n3nash commented on pull request #1115:
URL: https://github.com/apache/hudi/pull/1115#issuecomment-668905276


   @yanghua Is it okay to close this now ?



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] n3nash edited a comment on pull request #1242: [HUDI-544] Archived commits command code cleanup

2020-08-04 Thread GitBox


n3nash edited a comment on pull request #1242:
URL: https://github.com/apache/hudi/pull/1242#issuecomment-668905083


   @hddong Sorry this fell through, please rebase to resolve conflicts and I 
will merge this asap



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[jira] [Updated] (HUDI-1085) Support specifies archived path in HoodieSparkSqlWriter

2020-08-04 Thread Nishith Agarwal (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1085?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Nishith Agarwal updated HUDI-1085:
--
Priority: Blocker  (was: Major)

> Support specifies archived path in HoodieSparkSqlWriter
> ---
>
> Key: HUDI-1085
> URL: https://issues.apache.org/jira/browse/HUDI-1085
> Project: Apache Hudi
>  Issue Type: Task
>  Components: Docs, Release & Administrative
>Reporter: hong dongdong
>Assignee: hong dongdong
>Priority: Blocker
> Fix For: 0.6.0
>
>
> The default archived path was '' before. Now, we can specify the archived path 
> by overwriting the conf 
> "hoodie.archivelog.folder" (HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME).
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] n3nash commented on pull request #1242: [HUDI-544] Archived commits command code cleanup

2020-08-04 Thread GitBox


n3nash commented on pull request #1242:
URL: https://github.com/apache/hudi/pull/1242#issuecomment-668905083


   @hddong Sorry this fell through, please rebase and I will merge this asap



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[jira] [Updated] (HUDI-1085) Support specifies archived path in HoodieSparkSqlWriter

2020-08-04 Thread Nishith Agarwal (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1085?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Nishith Agarwal updated HUDI-1085:
--
Fix Version/s: 0.6.0

> Support specifies archived path in HoodieSparkSqlWriter
> ---
>
> Key: HUDI-1085
> URL: https://issues.apache.org/jira/browse/HUDI-1085
> Project: Apache Hudi
>  Issue Type: Task
>  Components: Docs, Release & Administrative
>Reporter: hong dongdong
>Assignee: hong dongdong
>Priority: Major
> Fix For: 0.6.0
>
>
> The default archived path was '' before. Now, we can specify the archived path 
> by overwriting the conf 
> "hoodie.archivelog.folder" (HoodieTableConfig.HOODIE_ARCHIVELOG_FOLDER_PROP_NAME).
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[hudi] branch master updated (ab11ba4 -> d3711a2)

2020-08-04 Thread nagarwal
This is an automated email from the ASF dual-hosted git repository.

nagarwal pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git.


from ab11ba4  [REVERT] "[HUDI-1058] Make delete marker configurable 
(#1819)" (#1914)
 add d3711a2  [HUDI-525] lack of insert info in delta_commit inflight

No new revisions were added by this update.

Summary of changes:
 .../apache/hudi/table/action/commit/BaseCommitActionExecutor.java   | 6 ++
 1 file changed, 6 insertions(+)



[GitHub] [hudi] n3nash merged pull request #1509: [HUDI-525] lack of insert info in delta_commit inflight

2020-08-04 Thread GitBox


n3nash merged pull request #1509:
URL: https://github.com/apache/hudi/pull/1509


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] bvaradar opened a new pull request #1917: [WIP] Copy of PR 1752 to debug CI failure. Not for merging

2020-08-04 Thread GitBox


bvaradar opened a new pull request #1917:
URL: https://github.com/apache/hudi/pull/1917


   



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] n3nash commented on pull request #1509: [HUDI-525] lack of insert info in delta_commit inflight

2020-08-04 Thread GitBox


n3nash commented on pull request #1509:
URL: https://github.com/apache/hudi/pull/1509#issuecomment-668903770


   LGTM



This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] n3nash commented on a change in pull request #1858: [HUDI-1014] Adding Upgrade and downgrade infra for smooth transitioning from list based rollback to marker based rollback

2020-08-04 Thread GitBox


n3nash commented on a change in pull request #1858:
URL: https://github.com/apache/hudi/pull/1858#discussion_r465405658



##
File path: 
hudi-client/src/main/java/org/apache/hudi/table/upgrade/UpgradeDowngradeUtil.java
##
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.table.upgrade;
+
+import org.apache.hudi.common.table.HoodieTableConfig;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
+import org.apache.hudi.common.table.HoodieTableVersion;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieException;
+
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import java.io.IOException;
+
+/**
+ * Helper class to assist in upgrading/downgrading Hoodie when there is a version change.
+ */
+public class UpgradeDowngradeUtil {

Review comment:
   @nsivabalan Can a user control the hoodie layout version manually from the HoodieWriteConfig? Say, choose the older timeline layout for 0.6.0, in which case there is no need to upgrade?
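
   If such a knob exists, usage would look roughly like the sketch below; the
   builder method name and the version constant are assumptions for
   illustration, not confirmed API, so verify against HoodieWriteConfig before
   relying on it:

   ```java
   import org.apache.hudi.config.HoodieWriteConfig;

   public class LayoutVersionExample {
     public static void main(String[] args) {
       // Assumed API: pin the pre-0.6.0 timeline layout (version 0) so that no
       // upgrade step is triggered on the next write.
       HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
           .withPath("/tmp/hudi_table")  // hypothetical base path
           .withTimelineLayoutVersion(0) // assumed builder hook
           .build();
       System.out.println(writeConfig);
     }
   }
   ```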





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [hudi] n3nash commented on a change in pull request #1915: [HUDI-1149] Added a console metrics reporter and associated unit tests.

2020-08-04 Thread GitBox


n3nash commented on a change in pull request #1915:
URL: https://github.com/apache/hudi/pull/1915#discussion_r465401588



##
File path: 
hudi-client/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java
##
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.metrics;
+
+import java.io.Closeable;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import com.codahale.metrics.ConsoleReporter;
+import com.codahale.metrics.MetricFilter;
+import com.codahale.metrics.MetricRegistry;
+
+/**
+ * Used for testing.

Review comment:
   Can you add some more Javadocs? Look at the Datadog metrics reporter for an example.
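
   As an illustration of the requested documentation (wording assumed, not from
   the PR), an expanded Javadoc might read:

   ```java
   /**
    * A metrics reporter that periodically dumps all registered metrics to stdout
    * using codahale's {@link com.codahale.metrics.ConsoleReporter}.
    *
    * Intended for unit tests and local debugging (e.g. the docker demo), where a
    * push-based sink such as the Datadog reporter would be overkill; it is not
    * meant for production use.
    */
   public class ConsoleMetricsReporter {
     // ...
   }
   ```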









[jira] [Updated] (HUDI-1025) Meter RPC calls in HoodieWrapperFileSystem

2020-08-04 Thread ASF GitHub Bot (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1025?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ASF GitHub Bot updated HUDI-1025:
-
Labels: pull-request-available  (was: )

> Meter RPC calls in HoodieWrapperFileSystem
> --
>
> Key: HUDI-1025
> URL: https://issues.apache.org/jira/browse/HUDI-1025
> Project: Apache Hudi
>  Issue Type: Improvement
>  Components: Common Core
>Reporter: Abhishek Modi
>Assignee: Abhishek Modi
>Priority: Minor
>  Labels: pull-request-available
>
> Hudi issues a very large number of RPC calls to DFS. When making changes to 
> Hudi, we try to ensure that the number of RPC calls does not increase 
> appreciably, as this could impact the DFS. 
> We should therefore meter HoodieWrapperFileSystem so that we can track the 
> RPC calls. This will help in service observability / SLA tracking and will 
> make it easier to tell when a change results in increased RPC load. 
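
A minimal sketch of what metering one such call could look like, assuming 
plain codahale metrics; the real HoodieWrapperFileSystem hooks may differ.

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;

public class MeteredListStatus {

  private static final MetricRegistry REGISTRY = new MetricRegistry();

  // Count and time the underlying namenode RPC so regressions show up in metrics.
  public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
    try (Timer.Context ignored = REGISTRY.timer("fs.listStatus").time()) {
      return fs.listStatus(path);
    }
  }
}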



--
This message was sent by Atlassian Jira
(v8.3.4#803005)


[GitHub] [hudi] modi95 opened a new pull request #1916: [HUDI-1025] Meter RPC calls in HoodieWrapperFileSystem

2020-08-04 Thread GitBox


modi95 opened a new pull request #1916:
URL: https://github.com/apache/hudi/pull/1916


   ## What is the purpose of the pull request
   
   This diff adds a lightweight metrics registry (based on codahale metrics) 
for Hudi Common. Other modules can use this registry to track their metrics as 
well. The Hudi Client has been modified to collect and emit the metrics from 
this registry.
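
   A hedged sketch of the shape such a codahale-backed registry might take; 
the class and method names here are illustrative, not the PR's actual API.

import java.util.Map;
import java.util.stream.Collectors;

import com.codahale.metrics.MetricRegistry;

public class RpcCallRegistry {

  private final MetricRegistry registry = new MetricRegistry();

  // Increment a named counter, creating it on first use.
  public void increment(String name) {
    registry.counter(name).inc();
  }

  // Snapshot all counters, e.g. for the client to emit at commit time.
  public Map<String, Long> getAllCounts() {
    return registry.getCounters().entrySet().stream()
        .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().getCount()));
  }
}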
   
   
   ## Verify this pull request
   
   This change added tests and can be verified as follows:
   
 - Added unit tests in `org.apache.hudi.common.TestRegistry`. 
 - Verified change using console logger metrics in Hudi Demo Docker.
 - These changes have been tested in production at Uber. 
   
   ## Committer checklist
   
- [ ] Has a corresponding JIRA in PR title & commit

- [ ] Commit message is descriptive of the change

- [ ] CI is green
   
- [ ] Necessary doc changes done or have another open PR
  
- [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.







[GitHub] [hudi] prashantwason opened a new pull request #1915: [HUDI-1149] Added a console metrics reporter and associated unit tests.

2020-08-04 Thread GitBox


prashantwason opened a new pull request #1915:
URL: https://github.com/apache/hudi/pull/1915


   ## *Tips*
   - *Thank you very much for contributing to Apache Hudi.*
   - *Please review https://hudi.apache.org/contributing.html before opening a 
pull request.*
   
   ## What is the purpose of the pull request
   
   Added a new metric reporter which prints the metrics on the console.
   
   ## Brief change log
   
   Added a new metric reporter which prints the metrics on the console.
   
   ## Verify this pull request
   
   Added a unit test.
   
   ## Committer checklist
   
- [ ] Has a corresponding JIRA in PR title & commit

- [ ] Commit message is descriptive of the change

- [ ] CI is green
   
- [ ] Necessary doc changes done or have another open PR
  
- [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.







[jira] [Updated] (HUDI-1149) Add a Console Metrics Reporter

2020-08-04 Thread ASF GitHub Bot (Jira)


 [ 
https://issues.apache.org/jira/browse/HUDI-1149?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

ASF GitHub Bot updated HUDI-1149:
-
Labels: pull-request-available  (was: )

> Add a Console Metrics Reporter
> --
>
> Key: HUDI-1149
> URL: https://issues.apache.org/jira/browse/HUDI-1149
> Project: Apache Hudi
>  Issue Type: Improvement
>Reporter: Prashant Wason
>Priority: Minor
>  Labels: pull-request-available
>
> A metrics reporter which dumps the metrics to the console. This is useful for 
> unit tests and for testing on the docker-based suite.
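
For context, the codahale ConsoleReporter that such a reporter can wrap looks 
roughly like this; a self-contained sketch, not the PR's code.

import java.util.concurrent.TimeUnit;

import com.codahale.metrics.ConsoleReporter;
import com.codahale.metrics.MetricRegistry;

public class ConsoleReporterExample {

  public static void main(String[] args) {
    MetricRegistry registry = new MetricRegistry();
    ConsoleReporter reporter = ConsoleReporter.forRegistry(registry)
        .convertRatesTo(TimeUnit.SECONDS)
        .convertDurationsTo(TimeUnit.MILLISECONDS)
        .build();
    registry.counter("commits").inc();
    reporter.start(10, TimeUnit.SECONDS); // periodic dump to stdout
    reporter.report();                    // or force an immediate dump
    reporter.stop();
  }
}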





[jira] [Created] (HUDI-1149) Add a Console Metrics Reporter

2020-08-04 Thread Prashant Wason (Jira)
Prashant Wason created HUDI-1149:


 Summary: Add a Console Metrics Reporter
 Key: HUDI-1149
 URL: https://issues.apache.org/jira/browse/HUDI-1149
 Project: Apache Hudi
  Issue Type: Improvement
Reporter: Prashant Wason


A metrics reporter which dumps the metrics to the console. This is useful for 
unit tests and for testing on the docker-based suite.





[GitHub] [hudi] umehrot2 commented on pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#issuecomment-668860202


   > @umehrot2 heads up, we could be landing #1848 before this (CI willing). 
How hard would the rebase be? I assume there would be some extra work to 
integrate?
   
   I guess one of us will have to rebase. While most of the work is isolated 
between the two PRs, some files are common and shared code areas have been 
touched. I am fine with doing a further rebase if that PR gets in first.







[GitHub] [hudi] umehrot2 commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465367893



##
File path: hudi-spark/src/main/scala/org/apache/hudi/HudiSparkUtils.scala
##
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hudi.common.model.HoodieRecord
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex}
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import scala.collection.JavaConverters._
+
+
+object HudiSparkUtils {
+
+  def getHudiMetadataSchema: StructType = {
+    StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
+      StructField(col, StringType, nullable = true)
+    }))
+  }
+
+  def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = {
+    paths.flatMap(path => {
+      val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory)
+      val globPaths = SparkHadoopUtil.get.globPathIfNecessary(fs, qualified)
+      globPaths
+    })
+  }
+
+  def createInMemoryFileIndex(sparkSession: SparkSession, globbedPaths: Seq[Path]): InMemoryFileIndex = {

Review comment:
   The common useful utilities have been contributed as part of 
https://github.com/apache/hudi/pull/1841









[GitHub] [hudi] umehrot2 commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465367609



##
File path: hudi-client/pom.xml
##
@@ -101,6 +101,11 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-avro_${scala.binary.version}</artifactId>

Review comment:
   This class is used in `hudi-client` inside 
`BootstrapCommitActionExecutor`. So we cannot move it to `hudi-spark`.









[GitHub] [hudi] vinothchandar removed a comment on pull request #1912: [HUDI-1098] Adding TimedWaitOnAppearConsistencyGuard

2020-08-04 Thread GitBox


vinothchandar removed a comment on pull request #1912:
URL: https://github.com/apache/hudi/pull/1912#issuecomment-668853480


   @umehrot2 can you please review this? 







[GitHub] [hudi] vinothchandar commented on pull request #1912: [HUDI-1098] Adding TimedWaitOnAppearConsistencyGuard

2020-08-04 Thread GitBox


vinothchandar commented on pull request #1912:
URL: https://github.com/apache/hudi/pull/1912#issuecomment-668853480


   @umehrot2 can you please review this? 







[hudi] branch master updated: [REVERT] "[HUDI-1058] Make delete marker configurable (#1819)" (#1914)

2020-08-04 Thread vinoth
This is an automated email from the ASF dual-hosted git repository.

vinoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
 new ab11ba4  [REVERT] "[HUDI-1058] Make delete marker configurable 
(#1819)" (#1914)
ab11ba4 is described below

commit ab11ba43e1a5496cf85a7a772929bb90fcbf07d3
Author: Sivabalan Narayanan 
AuthorDate: Tue Aug 4 18:20:38 2020 -0400

[REVERT] "[HUDI-1058] Make delete marker configurable (#1819)" (#1914)

This reverts commit 433d7d2c9886fed161557efe88b62ebdce0fe5df.
---
 .../org/apache/hudi/config/HoodieWriteConfig.java  |  9 --
 .../model/OverwriteWithLatestAvroPayload.java  | 10 +--
 .../model/TestOverwriteWithLatestAvroPayload.java  | 67 +--
 .../common/testutils/HoodieTestDataGenerator.java  | 27 ++
 .../main/java/org/apache/hudi/DataSourceUtils.java | 23 ++---
 .../SparkParquetBootstrapDataProvider.java |  2 +-
 .../scala/org/apache/hudi/DataSourceOptions.scala  |  7 --
 .../org/apache/hudi/HoodieSparkSqlWriter.scala |  5 +-
 .../functional/HoodieSparkSqlWriterSuite.scala | 46 --
 .../hudi/utilities/deltastreamer/DeltaSync.java|  6 +-
 ...eltaStreamerWithOverwriteLatestAvroPayload.java | 97 --
 .../resources/delta-streamer-config/source.avsc|  4 -
 .../sql-transformer.properties |  2 +-
 .../resources/delta-streamer-config/target.avsc|  4 -
 14 files changed, 43 insertions(+), 266 deletions(-)

diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index affe553..80bc17e 100644
--- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -94,9 +94,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
   public static final String BULKINSERT_SORT_MODE = "hoodie.bulkinsert.sort.mode";
   public static final String DEFAULT_BULKINSERT_SORT_MODE = BulkInsertSortMode.GLOBAL_SORT
       .toString();
-  public static final String DELETE_MARKER_FIELD_PROP = "hoodie.write.delete.marker.field";
-  public static final String DEFAULT_DELETE_MARKER_FIELD = "_hoodie_is_deleted";
-
 
   public static final String EMBEDDED_TIMELINE_SERVER_ENABLED = "hoodie.embed.timeline.server";
   public static final String DEFAULT_EMBEDDED_TIMELINE_SERVER_ENABLED = "true";
@@ -277,10 +274,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
     return BulkInsertSortMode.valueOf(sortMode.toUpperCase());
   }
 
-  public String getDeleteMarkerField() {
-    return props.getProperty(DELETE_MARKER_FIELD_PROP);
-  }
-
   /**
    * compaction properties.
    */
@@ -964,8 +957,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
       setDefaultOnCondition(props, !props.containsKey(AVRO_SCHEMA_VALIDATE), AVRO_SCHEMA_VALIDATE, DEFAULT_AVRO_SCHEMA_VALIDATE);
       setDefaultOnCondition(props, !props.containsKey(BULKINSERT_SORT_MODE),
           BULKINSERT_SORT_MODE, DEFAULT_BULKINSERT_SORT_MODE);
-      setDefaultOnCondition(props, !props.containsKey(DELETE_MARKER_FIELD_PROP),
-          DELETE_MARKER_FIELD_PROP, DEFAULT_DELETE_MARKER_FIELD);
 
       // Make sure the props is propagated
       setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().fromProperties(props).build());
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java
index 0e4b18a..d8dffdf 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java
@@ -36,8 +36,6 @@ import java.io.IOException;
 public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
     implements HoodieRecordPayload<OverwriteWithLatestAvroPayload> {
 
-  private String deleteMarkerField = "_hoodie_is_deleted";
-
   /**
    *
    */
@@ -49,12 +47,6 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
     this(record.isPresent() ? record.get() : null, (record1) -> 0); // natural order
   }
 
-  public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal,
-      String deleteMarkerField) {
-    this(record, orderingVal);
-    this.deleteMarkerField = deleteMarkerField;
-  }
-
   @Override
   public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) {
     // pick the payload with greatest ordering value
@@ -88,7 +80,7 @@ public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
    * @returns {@code true} if record represents a delete record. {@code false} otherwise.
    */
   private boolean 
[GitHub] [hudi] vinothchandar merged pull request #1914: Revert "[HUDI-1058] Make delete marker configurable (#1819)"

2020-08-04 Thread GitBox


vinothchandar merged pull request #1914:
URL: https://github.com/apache/hudi/pull/1914


   







[GitHub] [hudi] vinothchandar commented on pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


vinothchandar commented on pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#issuecomment-668852717


   @umehrot2 heads up, we could be landing #1848 before this (CI willing). 
How hard would the rebase be? I assume there would be some extra work to 
integrate? 







[GitHub] [hudi] umehrot2 commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465354934



##
File path: hudi-spark/src/main/scala/org/apache/hudi/HudiBootstrapRelation.scala
##
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi
+
+import org.apache.hadoop.fs.Path
+import org.apache.hudi.common.model.HoodieBaseFile
+import org.apache.hudi.common.table.{HoodieTableMetaClient, 
TableSchemaResolver}
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView
+import org.apache.hudi.exception.HoodieException
+import org.apache.spark.internal.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
+import org.apache.spark.sql.types.StructType
+
+import scala.collection.JavaConverters._
+
+/**
+  * This is a Spark relation that can be used for querying metadata-only/fully
+  * bootstrapped Hudi tables, as well as non-bootstrapped tables. It implements
+  * the PrunedFilteredScan interface to support column pruning and filter
+  * push-down. For metadata-bootstrapped files, if we query columns from both
+  * metadata and actual data, it will perform a merge of both to return the
+  * result.
+  *
+  * Caveat: Filter push-down does not work when querying both metadata and
+  * actual data columns over metadata-bootstrapped files, because the metadata
+  * file and the data file can return a different number of rows, causing
+  * errors while merging.
+  *
+  * @param _sqlContext Spark SQL Context
+  * @param userSchema User specified schema in the datasource query
+  * @param globPaths Globbed paths obtained from the user provided path for querying
+  * @param metaClient Hudi table meta client
+  * @param optParams DataSource options passed by the user
+  */
+class HudiBootstrapRelation(@transient val _sqlContext: SQLContext,
+                            val userSchema: StructType,
+                            val globPaths: Seq[Path],
+                            val metaClient: HoodieTableMetaClient,
+                            val optParams: Map[String, String])
+  extends BaseRelation with PrunedFilteredScan with Logging {
+
+  val skeletonSchema: StructType = HudiSparkUtils.getHudiMetadataSchema
+  var dataSchema: StructType = _
+  var fullSchema: StructType = _
+
+  val fileIndex: HudiBootstrapFileIndex = buildFileIndex()
+
+  override def sqlContext: SQLContext = _sqlContext
+
+  override val needConversion: Boolean = false
+
+  override def schema: StructType = inferFullSchema()
+
+  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
+    logInfo("Starting scan..")
+
+    // Compute splits
+    val bootstrapSplits = fileIndex.files.map(hoodieBaseFile => {
+      var skeletonFile: Option[PartitionedFile] = Option.empty
+      var dataFile: PartitionedFile = null
+
+      if (hoodieBaseFile.getExternalBaseFile.isPresent) {
+        skeletonFile = Option(PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen))
+        dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getExternalBaseFile.get().getPath, 0,
+          hoodieBaseFile.getExternalBaseFile.get().getFileLen)
+      } else {
+        dataFile = PartitionedFile(InternalRow.empty, hoodieBaseFile.getPath, 0, hoodieBaseFile.getFileLen)
+      }
+      HudiBootstrapSplit(dataFile, skeletonFile)
+    })
+    val tableState = HudiBootstrapTableState(bootstrapSplits)
+
+    // Get required schemas for column pruning
+    var requiredDataSchema = StructType(Seq())
+    var requiredSkeletonSchema = StructType(Seq())
+    requiredColumns.foreach(col => {
+      var field = dataSchema.find(_.name == col)
+      if (field.isDefined) {
+        requiredDataSchema = requiredDataSchema.add(field.get)
+      } else {
+        field = skeletonSchema.find(_.name == col)
+        requiredSkeletonSchema = 

[GitHub] [hudi] umehrot2 commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465353608



##
File path: hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala
##
@@ -54,29 +58,54 @@ class DefaultSource extends RelationProvider
     val parameters = Map(QUERY_TYPE_OPT_KEY -> DEFAULT_QUERY_TYPE_OPT_VAL) ++ translateViewTypesToQueryTypes(optParams)
 
     val path = parameters.get("path")
-    if (path.isEmpty) {
-      throw new HoodieException("'path' must be specified.")
-    }
 
     if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_SNAPSHOT_OPT_VAL)) {
-      // this is just effectively RO view only, where `path` can contain a mix of
-      // non-hoodie/hoodie path files. set the path filter up
-      sqlContext.sparkContext.hadoopConfiguration.setClass(
-        "mapreduce.input.pathFilter.class",
-        classOf[HoodieROTablePathFilter],
-        classOf[org.apache.hadoop.fs.PathFilter])
-
-      log.info("Constructing hoodie (as parquet) data source with options :" + parameters)
-      log.warn("Snapshot view not supported yet via data source, for MERGE_ON_READ tables. " +
-        "Please query the Hive table registered using Spark SQL.")
-      // simply return as a regular parquet relation
-      DataSource.apply(
-        sparkSession = sqlContext.sparkSession,
-        userSpecifiedSchema = Option(schema),
-        className = "parquet",
-        options = parameters)
-        .resolveRelation()
+      val readPathsStr = parameters.get(DataSourceReadOptions.READ_PATHS_OPT_KEY)

Review comment:
   These additional paths are being used in the **Incremental query** code 
to make it work for bootstrapped tables. I need to pass a list of bootstrapped 
files to read, which is why I had to add support for reading from multiple 
paths. `spark.read.parquet` already has that kind of support and is already 
used in the **incremental relation** to read a list of files.
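
   As a hedged illustration of that multi-path support (Java API shown; the 
helper class is hypothetical):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class MultiPathReadExample {

  // parquet(...) accepts varargs, so a list of bootstrapped data files
  // can be scanned in a single read.
  public static Dataset<Row> readFiles(SparkSession spark, String... paths) {
    return spark.read().parquet(paths);
  }
}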









[GitHub] [hudi] vinothchandar commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


vinothchandar commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465353210



##
File path: hudi-client/pom.xml
##
@@ -101,6 +101,11 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-avro_${scala.binary.version}</artifactId>

Review comment:
   >That is why I had to introduce spark-avro in hudi-client. If you agree 
with the above suggestion, and do not want spark-avro to be added to 
hudi-client then I would suggest moving this class to hudi-spark.
   
   If that class is supposed to work with the Spark datasource only, then yes, 
let's move it to hudi-spark. 









[GitHub] [hudi] umehrot2 commented on a change in pull request #1702: [HUDI-426] Bootstrap datasource integration

2020-08-04 Thread GitBox


umehrot2 commented on a change in pull request #1702:
URL: https://github.com/apache/hudi/pull/1702#discussion_r465352046



##
File path: hudi-client/pom.xml
##
@@ -101,6 +101,11 @@
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-avro_${scala.binary.version}</artifactId>

Review comment:
   I have explained the reason here: 
https://github.com/apache/hudi/pull/1876/#discussion_r463889083
   Let me know your thoughts.









[GitHub] [hudi] nsivabalan commented on pull request #1819: [HUDI-1058] Make delete marker configurable

2020-08-04 Thread GitBox


nsivabalan commented on pull request #1819:
URL: https://github.com/apache/hudi/pull/1819#issuecomment-668833066


   I see it now. Got it. 







[GitHub] [hudi] vinothchandar commented on pull request #1914: Revert "[HUDI-1058] Make delete marker configurable (#1819)"

2020-08-04 Thread GitBox


vinothchandar commented on pull request #1914:
URL: https://github.com/apache/hudi/pull/1914#issuecomment-668832594


   @nsivabalan let's also reopen the JIRA?







[GitHub] [hudi] nsivabalan opened a new pull request #1914: Revert "[HUDI-1058] Make delete marker configurable (#1819)"

2020-08-04 Thread GitBox


nsivabalan opened a new pull request #1914:
URL: https://github.com/apache/hudi/pull/1914


   This reverts commit 433d7d2c9886fed161557efe88b62ebdce0fe5df.
   
   ## What is the purpose of the pull request
   
   Reverting the configurable delete marker commit due to performance concerns 
in the existing write path. Will rework and put up a patch later. 
   
   ## Verify this pull request
   
   This pull request is reverting a previous commit. 
   
   ## Committer checklist
   
- [ ] Has a corresponding JIRA in PR title & commit

- [ ] Commit message is descriptive of the change

- [ ] CI is green
   
- [ ] Necessary doc changes done or have another open PR
  
- [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.







[GitHub] [hudi] vinothchandar commented on pull request #1819: [HUDI-1058] Make delete marker configurable

2020-08-04 Thread GitBox


vinothchandar commented on pull request #1819:
URL: https://github.com/apache/hudi/pull/1819#issuecomment-668831250


   @nsivabalan my concern is more about the overhead of the extra field on the 
payload serialization. No matter what constructor is called, there is a string 
field that will be serialized, right?
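
   To illustrate the concern: every payload instance carries the extra field, 
so it is written once per record regardless of which constructor was used. A 
self-contained sketch with illustrative class and field names:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class PayloadSizeDemo {

  static class Payload implements Serializable {
    byte[] avroBytes = new byte[0];
    // Present on every instance, serialized with every record:
    String deleteMarkerField = "_hoodie_is_deleted";
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
      oos.writeObject(new Payload());
    }
    System.out.println("Serialized size with marker field: " + bos.size() + " bytes");
  }
}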







[GitHub] [hudi] nsivabalan commented on pull request #1819: [HUDI-1058] Make delete marker configurable

2020-08-04 Thread GitBox


nsivabalan commented on pull request #1819:
URL: https://github.com/apache/hudi/pull/1819#issuecomment-668830077


   @vinothchandar : if I am not wrong, we added an additional overloaded 
constructor to OverwriteWithLatestAvroPayload, so this shouldn't have broken 
any existing implementations. Only if someone wants to leverage a user-defined 
column would they have to use the new constructor. Anyway, your perf reasoning 
is convincing, hence going ahead with reverting. 







[GitHub] [hudi] nsivabalan commented on pull request #1819: [HUDI-1058] Make delete marker configurable

2020-08-04 Thread GitBox


nsivabalan commented on pull request #1819:
URL: https://github.com/apache/hudi/pull/1819#issuecomment-668829234


   @shenh062326 : appreciate your help. We are looking to have a release by 
this weekend, so I am reverting this patch for now. I will work with you on 
the right fix for the configurable delete marker. If we can get it in by this 
weekend, well and good; even if not, we can land it later once we cut the 
0.6.0 release. 







[GitHub] [hudi] bvaradar commented on a change in pull request #1752: [HUDI-575] Support Async Compaction for spark streaming writes to hudi table

2020-08-04 Thread GitBox


bvaradar commented on a change in pull request #1752:
URL: https://github.com/apache/hudi/pull/1752#discussion_r465333562



##
File path: hudi-client/src/main/java/org/apache/hudi/async/AsyncCompactService.java
##
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.async;
+
+import org.apache.hudi.client.Compactor;
+import org.apache.hudi.client.HoodieWriteClient;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+
+import java.io.IOException;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Condition;
+import java.util.concurrent.locks.ReentrantLock;
+import java.util.stream.IntStream;
+
+/**
+ * Async Compactor Service that runs in a separate thread. Currently, only one
+ * compactor is allowed to run at any time.
+ */
+public class AsyncCompactService extends AbstractAsyncService {
+
+  private static final long serialVersionUID = 1L;
+  private static final Logger LOG = LogManager.getLogger(AsyncCompactService.class);
+
+  /**
+   * This is the job pool used by async compaction.
+   * In the case of deltastreamer, Spark job scheduling configs are automatically set.
+   * As the configs need to be set before the Spark context is initiated, this is
+   * not automated for Structured Streaming.
+   * https://spark.apache.org/docs/latest/job-scheduling.html
+   */
+  public static final String COMPACT_POOL_NAME = "hoodiecompact";
+
+  private final int maxConcurrentCompaction;
+  private transient Compactor compactor;
+  private transient JavaSparkContext jssc;
+  private transient BlockingQueue<HoodieInstant> pendingCompactions = new LinkedBlockingQueue<>();
+  private transient ReentrantLock queueLock = new ReentrantLock();
+  private transient Condition consumed = queueLock.newCondition();
+
+  public AsyncCompactService(JavaSparkContext jssc, HoodieWriteClient client) {
+    this.jssc = jssc;
+    this.compactor = new Compactor(client, jssc);
+    this.maxConcurrentCompaction = 1;
+  }
+
+  /**
+   * Enqueues a new pending compaction.
+   */
+  public void enqueuePendingCompaction(HoodieInstant instant) {
+    pendingCompactions.add(instant);
+  }
+
+  /**
+   * Wait till outstanding pending compactions reduce to the passed-in value.
+   *
+   * @param numPendingCompactions Maximum pending compactions allowed
+   * @throws InterruptedException
+   */
+  public void waitTillPendingCompactionsReducesTo(int numPendingCompactions) throws InterruptedException {
+    try {
+      queueLock.lock();
+      while (!isShutdown() && (pendingCompactions.size() > numPendingCompactions)) {
+        consumed.await();
+      }
+    } finally {
+      queueLock.unlock();
+    }
+  }
+
+  /**
+   * Fetch the next pending compaction, if available.
+   *
+   * @return
+   * @throws InterruptedException
+   */
+  private HoodieInstant fetchNextCompactionInstant() throws InterruptedException {
+    LOG.info("Compactor waiting up to 10 seconds for next instant for compaction");
+    HoodieInstant instant = pendingCompactions.poll(10, TimeUnit.SECONDS);
+    if (instant != null) {
+      try {
+        queueLock.lock();
+        // Signal waiting thread
+        consumed.signal();
+      } finally {
+        queueLock.unlock();
+      }
+    }
+    return instant;
+  }
+
+  /**
+   * Start Compaction Service.
+   */
+  @Override
+  protected Pair<CompletableFuture, ExecutorService> startService() {
+    ExecutorService executor = Executors.newFixedThreadPool(maxConcurrentCompaction,
+        r -> new Thread(r, "async_compact_thread"));
+    return Pair.of(CompletableFuture.allOf(IntStream.range(0, maxConcurrentCompaction).mapToObj(i -> CompletableFuture.supplyAsync(() -> {
+      try {
+        // Set Compactor Pool Name for allowing users to prioritize compaction
+
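
To make the scheduling-pool note in the javadoc above concrete, here is a 
minimal sketch assuming a standard Spark fair-scheduler setup; the app name, 
master, and wiring are illustrative.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CompactionPoolExample {

  public static void main(String[] args) {
    // Fair scheduling must be configured before the context is created,
    // which is why the service cannot set it up automatically.
    SparkConf conf = new SparkConf()
        .setAppName("hudi-compaction-pool-demo")
        .setMaster("local[*]")
        .set("spark.scheduler.mode", "FAIR");
    JavaSparkContext jssc = new JavaSparkContext(conf);

    // Jobs submitted from this thread now run in the "hoodiecompact" pool,
    // letting users prioritize compaction relative to ingestion.
    jssc.setLocalProperty("spark.scheduler.pool", "hoodiecompact");

    jssc.close();
  }
}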
