[GitHub] [hudi] stream2000 commented on a diff in pull request #9558: [HUDI-6481] Support run multi tables services in a single spark job
stream2000 commented on code in PR #9558: URL: https://github.com/apache/hudi/pull/9558#discussion_r1311178312 ## hudi-utilities/src/test/java/org/apache/hudi/utilities/multitable/TestHoodieMultiTableServicesMain.java: ## @@ -0,0 +1,304 @@ +/* + * + * * Licensed to the Apache Software Foundation (ASF) under one or more + * * contributor license agreements. See the NOTICE file distributed with + * * this work for additional information regarding copyright ownership. + * * The ASF licenses this file to You under the Apache License, Version 2.0 + * * (the "License"); you may not use this file except in compliance with + * * the License. You may obtain a copy of the License at + * * + * *http://www.apache.org/licenses/LICENSE-2.0 + * * + * * Unless required by applicable law or agreed to in writing, software + * * distributed under the License is distributed on an "AS IS" BASIS, + * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * * See the License for the specific language governing permissions and + * * limitations under the License. 
+ * + */ + +package org.apache.hudi.utilities.multitable; + +import org.apache.hudi.client.SparkRDDReadClient; +import org.apache.hudi.client.SparkRDDWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.config.HoodieCleanConfig; +import org.apache.hudi.config.HoodieCompactionConfig; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieLayoutConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner; +import org.apache.hudi.table.storage.HoodieStorageLayout; +import org.apache.hudi.testutils.providers.SparkProvider; +import org.apache.hudi.utilities.HoodieCompactor; + +import org.apache.hadoop.fs.Path; +import org.apache.spark.HoodieSparkKryoRegistrar$; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.ExecutionException; + +import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors; + +class TestHoodieMultiTableServicesMain extends HoodieCommonTestHarness implements SparkProvider { + + private static final Logger LOG = LoggerFactory.getLogger(ArchiveTask.class); + + protected boolean initialized = false; + + private static SparkSession spark; + private static SQLContext sqlContext; + private static JavaSparkContext jsc; + private static HoodieSparkEngineContext context; + + protected transient HoodieTestDataGenerator dataGen = null; + + @BeforeEach + public void init() throws IOException, ExecutionException, InterruptedException { +boolean initialized = spark != null; +if (!initialized) { + SparkConf sparkConf = conf(); + HoodieSparkKryoRegistrar$.MODULE$.register(sparkConf); + SparkRDDReadClient.addHoodieSupport(sparkConf); + spark = SparkSession.builder().config(sparkConf).getOrCreate(); + sqlContext = spark.sqlContext(); + jsc = new JavaSparkContext(spark.sparkContext()); + context = new HoodieSparkEngineContext(jsc); +} +initPath(); +prepareData(); + } + + @Test + public void testRunAllServices() throws IOException, ExecutionException, InterruptedException { +HoodieMultiTableServicesMain.Config cfg = getHoodieMultiServiceConfig(); +cfg.batch = true; +HoodieTableMetaClient metaClient1 = getMetaClient("table1"); +HoodieTableMetaClient metaClient2 = getMetaClient("table2"); +HoodieMultiTableServicesMain main = new HoodieMultiTableServicesMain(jsc, cfg); +main.startServices(); +// Verify cleans +Assertions.assertEquals(1, metaClient1.reloadActiveTimeline().ge
[GitHub] [hudi] hudi-bot commented on pull request #9585: [HUDI-6809] Optimizing the judgment of generating clustering plans
hudi-bot commented on PR #9585: URL: https://github.com/apache/hudi/pull/9585#issuecomment-1700470190 ## CI report: * b64e26ea48e2ff75920d3493be1d92d00c1471bd Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19571) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9584: [HUDI-6808] SkipCompaction Config should not affect the stream read of the cow table
hudi-bot commented on PR #9584: URL: https://github.com/apache/hudi/pull/9584#issuecomment-1700470149 ## CI report: * f0e27d3e68eda3b0eec63c3bfe4cbede271b6127 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19570) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9585: [HUDI-6809] Optimizing the judgment of generating clustering plans
hudi-bot commented on PR #9585: URL: https://github.com/apache/hudi/pull/9585#issuecomment-1700423426 ## CI report: * b64e26ea48e2ff75920d3493be1d92d00c1471bd UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9584: [HUDI-6808] SkipCompaction Config should not affect the stream read of the cow table
hudi-bot commented on PR #9584: URL: https://github.com/apache/hudi/pull/9584#issuecomment-1700423386 ## CI report: * f0e27d3e68eda3b0eec63c3bfe4cbede271b6127 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9581: [HUDI-6795] Implement writing record_positions to log blocks for updates and deletes
hudi-bot commented on PR #9581: URL: https://github.com/apache/hudi/pull/9581#issuecomment-1700415371 ## CI report: * 1208189ffb60441f9544933a2446ad194509c391 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19565) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9546: [HUDI-6397] [HUDI-6759] Fixing misc bugs w/ metadata table
hudi-bot commented on PR #9546: URL: https://github.com/apache/hudi/pull/9546#issuecomment-1700415179 ## CI report: * 98d5cfb130ae7ced0a32be01ff024afc9684d67a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19562) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] zhuanshenbsj1 opened a new pull request, #9585: [HUDI-6809] Optimizing the judgment of generating clustering plans
zhuanshenbsj1 opened a new pull request, #9585: URL: https://github.com/apache/hudi/pull/9585 ### Change Logs We currently use Flink to generate clustering plans online, and then use Spark to execute them offline. What we expect is to generate a clustering plan every four commits, like: commit1,commit2,commit3,commit4,clustering1,commit5,commit6,commit7,commit8,clustering2 However, when Spark executes slowly offline, it can cause the timeline to become: commit1,commit2,commit3,commit4,clustering1,commit5,clustering2,commit6,clustering3 This is not what we want. We should determine whether plan generation has been enabled, rather than whether plan execution has been enabled. ### Impact _Describe any public API or user-facing feature change or any performance impact._ ### Risk level (write none, low medium or high below) _If medium or high, explain what verification was done to mitigate the risks._ ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make changes to the website._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-6809) Optimizing the judgment of generating clustering plans
zhuanshenbsj1 created HUDI-6809: --- Summary: Optimizing the judgment of generating clustering plans Key: HUDI-6809 URL: https://issues.apache.org/jira/browse/HUDI-6809 Project: Apache Hudi Issue Type: Improvement Components: clustering Reporter: zhuanshenbsj1 Fix For: 0.14.0 -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] zhuanshenbsj1 opened a new pull request, #9584: [HUDI-6808] SkipCompaction Config should not affect the stream read of the cow table
zhuanshenbsj1 opened a new pull request, #9584: URL: https://github.com/apache/hudi/pull/9584 ### Change Logs The same action type (commit) is used after the completion of mor-compaction and cow-commit. If skipcompaction is configured, it will cause the stream read to skip the normal commit when reading the cow table. SkipCompaction Config should not affect the stream read of the cow table. ### Impact _Describe any public API or user-facing feature change or any performance impact._ ### Risk level (write none, low medium or high below) _If medium or high, explain what verification was done to mitigate the risks._ ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make changes to the website._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] viverlxl closed issue #9575: [SUPPORT] sparkSql create table sync metastore Failed
viverlxl closed issue #9575: [SUPPORT] sparkSql create table sync metastore Failed URL: https://github.com/apache/hudi/issues/9575 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Created] (HUDI-6808) SkipCompaction Config should not affect the stream read of the cow table
zhuanshenbsj1 created HUDI-6808: --- Summary: SkipCompaction Config should not affect the stream read of the cow table Key: HUDI-6808 URL: https://issues.apache.org/jira/browse/HUDI-6808 Project: Apache Hudi Issue Type: Improvement Reporter: zhuanshenbsj1 Fix For: 0.14.0 -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] codope commented on a diff in pull request #9565: [HUDI-6725] Support efficient completion time queries on the timeline
codope commented on code in PR #9565: URL: https://github.com/apache/hudi/pull/9565#discussion_r1311099826 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/timeline/CompletionTimeQueryView.java: ## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.client.timeline; + +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.generic.GenericRecord; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.apache.hudi.common.table.timeline.HoodieArchivedTimeline.COMPLETION_TIME_ARCHIVED_META_FIELD; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.EQUALS; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; + +/** + * Query view for instant completion time. 
+ */ +public class CompletionTimeQueryView implements AutoCloseable { + private final HoodieTableMetaClient metaClient; + + /** + * Mapping from instant start time -> completion time. + * Should be thread-safe data structure. + */ + private final Map startToCompletionInstantTimeMap; + + /** + * The start instant time to eagerly load from, by default load last 3 days of completed instants. + */ + private final String startInstant; + + /** + * The first instant on the active timeline, used for query optimization. + */ + private final String firstInstantOnActiveTimeline; + + /** + * The constructor. + * + * @param metaClient The table meta client. + * @param startInstant The earliest instant time to eagerly load from, by default load last 3 days of completed instants. + */ + public CompletionTimeQueryView(HoodieTableMetaClient metaClient, String startInstant) { +this.metaClient = metaClient; +this.startToCompletionInstantTimeMap = new ConcurrentHashMap<>(); +this.startInstant = startInstant; +this.firstInstantOnActiveTimeline = metaClient.getActiveTimeline().firstInstant().map(HoodieInstant::getTimestamp).orElse(""); +load(); + } + + /** + * Queries the instant completion time with given start time. + * + * @param startTime The start time. + * + * @return The completion time if the instant finished or empty if it is still pending. + */ + public Option getCompletionTime(String startTime) { +String completionTime = this.startToCompletionInstantTimeMap.get(startTime); +if (completionTime != null) { + return Option.of(completionTime); +} +if (HoodieTimeline.compareTimestamps(startTime, GREATER_THAN, this.firstInstantOnActiveTimeline)) { + // the instant is still pending + return Option.empty(); +} +// the 'startTime' should be out of the eager loading range, switch to a lazy loading. +// This operation is resource costly. 
+HoodieArchivedTimeline.loadInstants(metaClient, +new EQTsFilter(startTime), +HoodieArchivedTimeline.LoadMode.SLIM, +r -> true, +this::readCompletionTime); +return Option.ofNullable(this.startToCompletionInstantTimeMap.get(startTime)); + } + + /** + * This is method to read instant completion time. + * This would also update 'startToCompletionInstantTimeMap' map with start time/completion time pairs. + * Only instants starts from 'startInstant' (inclusive) are considered. + */ + private void load() { +// load active instants first. +this.metaClient.getActiveTimeline() +.filterCompletedInstants().getInstantsAsStream() +.forEach(instant -> setCompletionTime(instant.getTimestamp(), instant.getStateTransitionTime())); +// then load the archived instants. +HoodieArchivedTimeline.loadInstants(metaClient, +new HoodieArchivedTimeline.StartTsFilter(this.startInstant), +HoodieArchivedTimeline.LoadMode.SLIM, +r -> true, +this::readCompletionTime); + } + + private void readCompletionTime(String instantTime, GenericRecord record) { +
[GitHub] [hudi] hudi-bot commented on pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
hudi-bot commented on PR #9572: URL: https://github.com/apache/hudi/pull/9572#issuecomment-1700369972 ## CI report: * ad05887b523496f59ac8b6e976183d6c325ed94d UNKNOWN * cf848446b9c837be3c1c2fdc7930b26f920a0754 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19563) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9571: Enabling comprehensive schema evolution in delta streamer code
hudi-bot commented on PR #9571: URL: https://github.com/apache/hudi/pull/9571#issuecomment-1700369921 ## CI report: * 3af6011d72b294b0995d52be40a6d91e6eff9a1b Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19561) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9583: [Test]update operator name for compact&clustering test class
hudi-bot commented on PR #9583: URL: https://github.com/apache/hudi/pull/9583#issuecomment-1700342790 ## CI report: * 4f265efa3f9a216c511abf94c065700e74b21679 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19569) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9583: [Test]update operator name for compact&clustering test class
hudi-bot commented on PR #9583: URL: https://github.com/apache/hudi/pull/9583#issuecomment-1700337856 ## CI report: * 4f265efa3f9a216c511abf94c065700e74b21679 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9577: [HUDI-6805] Print detailed error message in clustering
hudi-bot commented on PR #9577: URL: https://github.com/apache/hudi/pull/9577#issuecomment-1700337802 ## CI report: * 9d1b03d93f9f5bfab485a89e4b3de9aa9cca4f17 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19550) * aab8e18c70e3d488f920659ee8c210fde5bddf1c Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19568) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9577: [HUDI-6805] Print detailed error message in clustering
hudi-bot commented on PR #9577: URL: https://github.com/apache/hudi/pull/9577#issuecomment-1700332724 ## CI report: * 9d1b03d93f9f5bfab485a89e4b3de9aa9cca4f17 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19550) * aab8e18c70e3d488f920659ee8c210fde5bddf1c UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9564: [HUDI-6712] Add Parquet file metadata loader
hudi-bot commented on PR #9564: URL: https://github.com/apache/hudi/pull/9564#issuecomment-1700332631 ## CI report: * f6a01d87b32aebfce310375b8925f4802acca686 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19560) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9482: [HUDI-6728] Update BigQuery manifest sync to support schema evolution
hudi-bot commented on PR #9482: URL: https://github.com/apache/hudi/pull/9482#issuecomment-1700332485 ## CI report: * 39166302aadd51524e017f92a883e960e07a37a4 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19426) * b84a6f31d753b486645d333fd645f7841de3e6e8 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19567) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] voonhous commented on issue #9536: Duplicate Row in Same Partition using Global Bloom Index
voonhous commented on issue #9536: URL: https://github.com/apache/hudi/issues/9536#issuecomment-1700326761 FWIU, this is a sporadic thing that OP is not able to reproduce anymore. Might be related to this issue: https://github.com/apache/hudi/pull/9035 One way to determine if it is caused by this issue is: 1. Identify the 2 parquet files that the 2 duplicate rows are situated in 2. If it is caused by the issue linked above, the commit time should be the same (assuming COW table) 3. If it is this issue and if you are still able to access your Spark tracking URL, you can probably look at the timing of the stages and see if there's a zombie executor/task that has not been killed after `reconcileAgainstMarker` has been called. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hehuiyuan opened a new pull request, #9583: update operator name for compact&clustering test class
hehuiyuan opened a new pull request, #9583: URL: https://github.com/apache/hudi/pull/9583 ### Change Logs Update some incorrect operator names and uids. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Closed] (HUDI-3727) Add metrics for async indexer
[ https://issues.apache.org/jira/browse/HUDI-3727?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Sagar Sumit closed HUDI-3727. - Resolution: Done > Add metrics for async indexer > - > > Key: HUDI-3727 > URL: https://issues.apache.org/jira/browse/HUDI-3727 > Project: Apache Hudi > Issue Type: Task >Reporter: Sagar Sumit >Assignee: Sagar Sumit >Priority: Major > Labels: pull-request-available > Fix For: 1.0.0 > > > Add metrics for async metadata indexer, e.g. time for base file > initialization, time for catch up etc. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] aajisaka commented on issue #8160: [SUPPORT] Schema evolution wrt to datatype promotion isnt working. org.apache.avro.AvroRuntimeException: cannot support rewrite value for schema type
aajisaka commented on issue #8160: URL: https://github.com/apache/hudi/issues/8160#issuecomment-1700309154 @ad1happy2go I think we can just close this issue given there's no response from the requester. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9582: [MINOR]Fix hbase index config improper use
hudi-bot commented on PR #9582: URL: https://github.com/apache/hudi/pull/9582#issuecomment-1700308019 ## CI report: * cc93db8d3ad775d3fd244d07ad17786596377e55 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19566) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9482: [HUDI-6728] Update BigQuery manifest sync to support schema evolution
hudi-bot commented on PR #9482: URL: https://github.com/apache/hudi/pull/9482#issuecomment-1700307775 ## CI report: * 39166302aadd51524e017f92a883e960e07a37a4 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19426) * b84a6f31d753b486645d333fd645f7841de3e6e8 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] aajisaka commented on a diff in pull request #9577: [HUDI-6805] Print detailed error message in clustering
aajisaka commented on code in PR #9577: URL: https://github.com/apache/hudi/pull/9577#discussion_r1311043084 ## hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/row/HoodieRowCreateHandle.java: ## @@ -241,6 +242,9 @@ public WriteStatus close() throws IOException { stat.setTotalWriteBytes(fileSizeInBytes); stat.setFileSizeInBytes(fileSizeInBytes); stat.setTotalWriteErrors(writeStatus.getTotalErrorRecords()); +for (Pair pair : writeStatus.getFailedRecords()) { + LOG.info("Failed to write {}", pair.getLeft(), pair.getRight()); Review Comment: Agreed we should use ERROR log level, but using two `{}` doesn't make sense to me. It will run `Throwable.toString()` and then print only the short description. Stacktrace won't be printed. Instead we should directly pass the Throwable to the logging API. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9582: [MINOR]Fix hbase index config improper use
hudi-bot commented on PR #9582: URL: https://github.com/apache/hudi/pull/9582#issuecomment-1700303079 ## CI report: * cc93db8d3ad775d3fd244d07ad17786596377e55 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on issue #9430: [SUPPORT] Problem when refactor a custom payload to new API defined in RFC-46
yihua commented on issue #9430: URL: https://github.com/apache/hudi/issues/9430#issuecomment-1700302192 Great! @beyond1920 what's your Slack or WeChat handle? Since @linliu-code is fixing this and making it production-ready, the three of us should sync and see how we can divide the work here. You can still focus on supporting the new merger API for other engines like Flink. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9581: [HUDI-6795] Implement writing record_positions to log blocks for updates and deletes
hudi-bot commented on PR #9581: URL: https://github.com/apache/hudi/pull/9581#issuecomment-1700297752 ## CI report: * 1208189ffb60441f9544933a2446ad194509c391 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19565) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] flashJd opened a new pull request, #9582: [MINOR]Fix hbase index config improper use
flashJd opened a new pull request, #9582: URL: https://github.com/apache/hudi/pull/9582 ### Change Logs _Describe context and summary for this change. Highlight if any code was copied._ ### Impact N/A ### Risk level (write none, low medium or high below) N/A ### Documentation Update N/A ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on a diff in pull request #9482: [HUDI-6728] Update BigQuery manifest sync to support schema evolution
nsivabalan commented on code in PR #9482: URL: https://github.com/apache/hudi/pull/9482#discussion_r1311022781 ## hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncTool.java: ## @@ -52,34 +57,55 @@ public class BigQuerySyncTool extends HoodieSyncTool { private static final Logger LOG = LoggerFactory.getLogger(BigQuerySyncTool.class); - public final BigQuerySyncConfig config; - public final String tableName; - public final String manifestTableName; - public final String versionsTableName; - public final String snapshotViewName; + private final BigQuerySyncConfig config; + private final String tableName; + private final String manifestTableName; + private final String versionsTableName; + private final String snapshotViewName; + private final ManifestFileWriter manifestFileWriter; + private final HoodieBigQuerySyncClient bqSyncClient; + private final HoodieTableMetaClient metaClient; + private final BigQuerySchemaResolver bqSchemaResolver; public BigQuerySyncTool(Properties props) { -super(props); +// will build file writer, client, etc. from configs +this(props, null, null, null, null); Review Comment: can we do the initialization here only and avoid ``` this.bqSyncClient = providedBqSyncClient == null ? new HoodieBigQuerySyncClient(config) : providedBqSyncClient; ``` calls like this in the other constructor. 
## hudi-gcp/src/main/java/org/apache/hudi/gcp/bigquery/BigQuerySyncTool.java: ## @@ -52,34 +57,55 @@ public class BigQuerySyncTool extends HoodieSyncTool { private static final Logger LOG = LoggerFactory.getLogger(BigQuerySyncTool.class); - public final BigQuerySyncConfig config; - public final String tableName; - public final String manifestTableName; - public final String versionsTableName; - public final String snapshotViewName; + private final BigQuerySyncConfig config; + private final String tableName; + private final String manifestTableName; + private final String versionsTableName; + private final String snapshotViewName; + private final ManifestFileWriter manifestFileWriter; + private final HoodieBigQuerySyncClient bqSyncClient; + private final HoodieTableMetaClient metaClient; + private final BigQuerySchemaResolver bqSchemaResolver; public BigQuerySyncTool(Properties props) { -super(props); +// will build file writer, client, etc. from configs +this(props, null, null, null, null); + } + + @VisibleForTesting // allows us to pass in mocks for the writer and client + BigQuerySyncTool(Properties properties, ManifestFileWriter manifestFileWriter, HoodieBigQuerySyncClient providedBqSyncClient, HoodieTableMetaClient providedMetaClient, + BigQuerySchemaResolver providedBqSchemaResolver) { +super(properties); this.config = new BigQuerySyncConfig(props); this.tableName = config.getString(BIGQUERY_SYNC_TABLE_NAME); this.manifestTableName = tableName + "_manifest"; this.versionsTableName = tableName + "_versions"; this.snapshotViewName = tableName; +this.bqSyncClient = providedBqSyncClient == null ? new HoodieBigQuerySyncClient(config) : providedBqSyncClient; +// reuse existing meta client if not provided (only test cases will provide their own meta client) Review Comment: lets not block this PR on this discussion. we can take it async. lets go ahead w/ the patch. I am good -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] FishMAN002 commented on issue #9506: [SUPPORT] ctas error in spark3.1.1 & hudi 0.13.1
FishMAN002 commented on issue #9506: URL: https://github.com/apache/hudi/issues/9506#issuecomment-1700277934 > @ad1happy2go Are you suggesting that I try this command: ``` /usr/local/opt/apache-maven-3.8.5/bin/mvn clean package -DskipTests -Dspark3.1 -Dflink1.14 -Dscala-2.12 -Drat.skip=true -Dcheckstyle.skip=true **--packages org.apache.hudi:hudi-spark3.1-bundle_2.12:0.13.1** ``` If that's not the case, could you provide me with a complete command? Thank you very much! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-6780) Replace classnames by modes/enums in table properties
[ https://issues.apache.org/jira/browse/HUDI-6780?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Sagar Sumit updated HUDI-6780: -- Status: In Progress (was: Open) > Replace classnames by modes/enums in table properties > - > > Key: HUDI-6780 > URL: https://issues.apache.org/jira/browse/HUDI-6780 > Project: Apache Hudi > Issue Type: Task >Reporter: Sagar Sumit >Assignee: Sagar Sumit >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6779) Audit current hoodie.properties
[ https://issues.apache.org/jira/browse/HUDI-6779?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Sagar Sumit updated HUDI-6779: -- Status: In Progress (was: Open) > Audit current hoodie.properties > --- > > Key: HUDI-6779 > URL: https://issues.apache.org/jira/browse/HUDI-6779 > Project: Apache Hudi > Issue Type: Task >Reporter: Sagar Sumit >Assignee: Sagar Sumit >Priority: Major > Fix For: 1.0.0 > > > Remove some configs from table to write configs -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6776) Unify commit metadata content in json for completed and avro for pending commits
[ https://issues.apache.org/jira/browse/HUDI-6776?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Sagar Sumit updated HUDI-6776: -- Status: In Progress (was: Open) > Unify commit metadata content in json for completed and avro for pending > commits > > > Key: HUDI-6776 > URL: https://issues.apache.org/jira/browse/HUDI-6776 > Project: Apache Hudi > Issue Type: Task >Reporter: Sagar Sumit >Assignee: Sagar Sumit >Priority: Major > Labels: pull-request-available > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] hudi-bot commented on pull request #9581: [HUDI-6795] Implement writing record_positions to log blocks for updates and deletes
hudi-bot commented on PR #9581: URL: https://github.com/apache/hudi/pull/9581#issuecomment-1700264208 ## CI report: * 1208189ffb60441f9544933a2446ad194509c391 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] stream2000 commented on a diff in pull request #9515: [HUDI-2141] Support flink compaction metrics
stream2000 commented on code in PR #9515: URL: https://github.com/apache/hudi/pull/9515#discussion_r1311016199 ## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/metrics/FlinkWriteMetrics.java: ## @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.metrics; + +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; + +import org.apache.flink.metrics.MetricGroup; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.text.ParseException; + +/** + * Common flink write commit metadata metrics + */ +public class FlinkWriteMetrics extends HoodieFlinkMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(FlinkWriteMetrics.class); + + protected final String actionType; + + private long totalPartitionsWritten; + private long totalFilesInsert; + private long totalFilesUpdate; + private long totalRecordsWritten; + private long totalUpdateRecordsWritten; + private long totalInsertRecordsWritten; + private long totalBytesWritten; + private long totalScanTime; + private long totalCreateTime; + private long totalUpsertTime; + private long totalCompactedRecordsUpdated; + private long totalLogFilesCompacted; + private long totalLogFilesSize; + private long commitLatencyInMs; + private long commitFreshnessInMs; + private long commitEpochTimeInMs; + private long durationInMs; + + public FlinkWriteMetrics(MetricGroup metricGroup, String actionType) { +super(metricGroup); +this.actionType = actionType; + } + + @Override + public void registerMetrics() { +// register commit gauge +metricGroup.gauge(getMetricsName(actionType, "totalPartitionsWritten"), () -> totalPartitionsWritten); +metricGroup.gauge(getMetricsName(actionType, "totalFilesInsert"), () -> totalFilesInsert); +metricGroup.gauge(getMetricsName(actionType, "totalFilesUpdate"), () -> totalFilesUpdate); +metricGroup.gauge(getMetricsName(actionType, "totalRecordsWritten"), () -> totalRecordsWritten); +metricGroup.gauge(getMetricsName(actionType, "totalUpdateRecordsWritten"), () -> totalUpdateRecordsWritten); 
+metricGroup.gauge(getMetricsName(actionType, "totalInsertRecordsWritten"), () -> totalInsertRecordsWritten); +metricGroup.gauge(getMetricsName(actionType, "totalBytesWritten"), () -> totalBytesWritten); +metricGroup.gauge(getMetricsName(actionType, "totalScanTime"), () -> totalScanTime); +metricGroup.gauge(getMetricsName(actionType, "totalCreateTime"), () -> totalCreateTime); +metricGroup.gauge(getMetricsName(actionType, "totalUpsertTime"), () -> totalUpsertTime); +metricGroup.gauge(getMetricsName(actionType, "totalCompactedRecordsUpdated"), () -> totalCompactedRecordsUpdated); Review Comment: I think we can make use of these commit metrics in the near future once [Flink supports coordinator metrics](https://cwiki.apache.org/confluence/display/FLINK/FLIP-274%3A+Introduce+metric+group+for+OperatorCoordinator). So we can keep them now so that we won't need large modifications if we can gather metrics from `StreamwriteCoordinator`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9538: [HUDI-6738] - Apply object filter before checkpoint batching in GcsEventsHoodieIncrSource
hudi-bot commented on PR #9538: URL: https://github.com/apache/hudi/pull/9538#issuecomment-1700249267 ## CI report: * d8d12bf0d3d2c24b0f03be4faf4c293c70db9ecd Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19559) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-6795) Implement generation of record_positions for updates and deletes on write path
[ https://issues.apache.org/jira/browse/HUDI-6795?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-6795: - Labels: pull-request-available (was: ) > Implement generation of record_positions for updates and deletes on write path > -- > > Key: HUDI-6795 > URL: https://issues.apache.org/jira/browse/HUDI-6795 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Blocker > Labels: pull-request-available > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] yihua opened a new pull request, #9581: [HUDI-6795] Implement writing record_positions to log blocks for updates and deletes
yihua opened a new pull request, #9581: URL: https://github.com/apache/hudi/pull/9581 ### Change Logs _Describe context and summary for this change. Highlight if any code was copied._ ### Impact _Describe any public API or user-facing feature change or any performance impact._ ### Risk level (write none, low medium or high below) _If medium or high, explain what verification was done to mitigate the risks._ ### Documentation Update _Describe any necessary documentation update if there is any new feature, config, or user-facing change_ - _The config description must be updated if new configs are added or the default value of the configs are changed_ - _Any new feature or user-facing change requires updating the Hudi website. Please create a Jira ticket, attach the ticket number here and follow the [instruction](https://hudi.apache.org/contribute/developer-setup#website) to make changes to the website._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] empcl commented on a diff in pull request #9580: automatically create a database when using the flink catalog dfs mode
empcl commented on code in PR #9580: URL: https://github.com/apache/hudi/pull/9580#discussion_r1311005106 ## pom.xml: ## @@ -1714,6 +1714,11 @@ + + nexus-aliyun + Nexus aliyun Review Comment: Sorry, it was too late at the time and I didn't pay much attention. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #9580: automatically create a database when using the flink catalog dfs mode
danny0405 commented on code in PR #9580: URL: https://github.com/apache/hudi/pull/9580#discussion_r1311004467 ## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java: ## @@ -125,6 +125,15 @@ public void open() throws CatalogException { } catch (IOException e) { throw new CatalogException(String.format("Checking catalog path %s exists exception.", catalogPathStr), e); } +if (!databaseExists(getDefaultDatabase())) { + LOG.info("{} does not exist, will be created.", getDefaultDatabase()); + String dbPath = catalogPath + "/" + getDefaultDatabase(); + try { +fs.mkdirs(new Path(dbPath)); + } catch (IOException e) { +throw new RuntimeException(String.format("mkdir db path %s exception.", dbPath), e); Review Comment: error creating database path {} ... -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #9580: automatically create a database when using the flink catalog dfs mode
danny0405 commented on code in PR #9580: URL: https://github.com/apache/hudi/pull/9580#discussion_r1311004246 ## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/catalog/HoodieCatalog.java: ## @@ -125,6 +125,15 @@ public void open() throws CatalogException { } catch (IOException e) { throw new CatalogException(String.format("Checking catalog path %s exists exception.", catalogPathStr), e); } +if (!databaseExists(getDefaultDatabase())) { + LOG.info("{} does not exist, will be created.", getDefaultDatabase()); Review Comment: `Creating database {} automatically because it does not exist` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #9580: automatically create a database when using the flink catalog dfs mode
danny0405 commented on code in PR #9580: URL: https://github.com/apache/hudi/pull/9580#discussion_r1311003903 ## pom.xml: ## @@ -1714,6 +1714,11 @@ + + nexus-aliyun + Nexus aliyun Review Comment: do we need this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Closed] (HUDI-6763) WriteStats are extracted twice in BaseSparkCommitActionExecutor
[ https://issues.apache.org/jira/browse/HUDI-6763?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Danny Chen closed HUDI-6763. Resolution: Fixed Fixed via master branch: 2f7e9caebb0e7f68a7cc1a9c541cc67440eafa44 > WriteStats are extracted twice in BaseSparkCommitActionExecutor > --- > > Key: HUDI-6763 > URL: https://issues.apache.org/jira/browse/HUDI-6763 > Project: Apache Hudi > Issue Type: Bug >Reporter: Timothy Brown >Assignee: Timothy Brown >Priority: Minor > Labels: pull-request-available > Fix For: 0.14.0 > > > In BaseSparkCommitActionExecutor there are two places the same > `collectAsList` is called on an RDD. We can optimize this by only calling > this method once. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6763) WriteStats are extracted twice in BaseSparkCommitActionExecutor
[ https://issues.apache.org/jira/browse/HUDI-6763?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Danny Chen updated HUDI-6763: - Fix Version/s: 0.14.0 > WriteStats are extracted twice in BaseSparkCommitActionExecutor > --- > > Key: HUDI-6763 > URL: https://issues.apache.org/jira/browse/HUDI-6763 > Project: Apache Hudi > Issue Type: Bug >Reporter: Timothy Brown >Assignee: Timothy Brown >Priority: Minor > Labels: pull-request-available > Fix For: 0.14.0 > > > In BaseSparkCommitActionExecutor there are two places the same > `collectAsList` is called on an RDD. We can optimize this by only calling > this method once. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-6763] Optimize collect calls (#9561)
This is an automated email from the ASF dual-hosted git repository. danny0405 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 2f7e9caebb0 [HUDI-6763] Optimize collect calls (#9561) 2f7e9caebb0 is described below commit 2f7e9caebb0e7f68a7cc1a9c541cc67440eafa44 Author: Tim Brown AuthorDate: Wed Aug 30 20:37:23 2023 -0500 [HUDI-6763] Optimize collect calls (#9561) --- .../table/action/commit/BaseSparkCommitActionExecutor.java | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java index 7383f428e0a..040cc798747 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/BaseSparkCommitActionExecutor.java @@ -286,7 +286,9 @@ public abstract class BaseSparkCommitActionExecutor extends @Override protected void setCommitMetadata(HoodieWriteMetadata> result) { - result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(result.getWriteStatuses().map(WriteStatus::getStat).collectAsList(), +List writeStats = result.getWriteStatuses().map(WriteStatus::getStat).collectAsList(); +result.setWriteStats(writeStats); +result.setCommitMetadata(Option.of(CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(), extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType(; } @@ -294,16 +296,14 @@ public abstract class BaseSparkCommitActionExecutor extends @Override protected void commit(Option> extraMetadata, HoodieWriteMetadata> result) { context.setJobStatus(this.getClass().getSimpleName(), "Commit write status collect: " + 
config.getTableName()); -commit(extraMetadata, result, result.getWriteStatuses().map(WriteStatus::getStat).collectAsList()); - } - - protected void commit(Option> extraMetadata, HoodieWriteMetadata> result, List writeStats) { String actionType = getCommitActionType(); LOG.info("Committing " + instantTime + ", action Type " + actionType + ", operation Type " + operationType); result.setCommitted(true); -result.setWriteStats(writeStats); +if (!result.getWriteStats().isPresent()) { + result.setWriteStats(result.getWriteStatuses().map(WriteStatus::getStat).collectAsList()); +} // Finalize write -finalizeWrite(instantTime, writeStats, result); +finalizeWrite(instantTime, result.getWriteStats().get(), result); try { HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieCommitMetadata metadata = result.getCommitMetadata().get();
[GitHub] [hudi] CTTY commented on pull request #8929: [HUDI-6350] Allow athena to use the metadata table
CTTY commented on PR #8929: URL: https://github.com/apache/hudi/pull/8929#issuecomment-1700184768 I've manually tested this on EMR 6.12 by setting configs below when writing data: ``` .option(DataSourceWriteOptions.META_SYNC_CLIENT_TOOL_CLASS_NAME.key, "org.apache.hudi.aws.sync.AwsGlueCatalogSyncTool") .option(GlueCatalogSyncClientConfig.GLUE_METADATA_FILE_LISTING.key, "TRUE") ``` It works very well. Nice work! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
hudi-bot commented on PR #9572: URL: https://github.com/apache/hudi/pull/9572#issuecomment-1700031447 ## CI report: * ad05887b523496f59ac8b6e976183d6c325ed94d UNKNOWN * 7e769e60b101466c27604ce531b95f42eab87885 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19546) * cf848446b9c837be3c1c2fdc7930b26f920a0754 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19563) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9546: [HUDI-6397] [HUDI-6759] Fixing misc bugs w/ metadata table
hudi-bot commented on PR #9546: URL: https://github.com/apache/hudi/pull/9546#issuecomment-1700031017 ## CI report: * f391322cd2d754ce85fbd33ca516c19d688ab784 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19496) * 98d5cfb130ae7ced0a32be01ff024afc9684d67a Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19562) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9546: [HUDI-6397] [HUDI-6759] Fixing misc bugs w/ metadata table
hudi-bot commented on PR #9546: URL: https://github.com/apache/hudi/pull/9546#issuecomment-1700013861 ## CI report: * f391322cd2d754ce85fbd33ca516c19d688ab784 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19496) * 98d5cfb130ae7ced0a32be01ff024afc9684d67a UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
hudi-bot commented on PR #9572: URL: https://github.com/apache/hudi/pull/9572#issuecomment-1700014321 ## CI report: * ad05887b523496f59ac8b6e976183d6c325ed94d UNKNOWN * 7e769e60b101466c27604ce531b95f42eab87885 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19546) * cf848446b9c837be3c1c2fdc7930b26f920a0754 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
yihua commented on code in PR #9572: URL: https://github.com/apache/hudi/pull/9572#discussion_r1310958195 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/common/table/log/HoodieFileSliceReader.java: ## @@ -38,12 +36,11 @@ public class HoodieFileSliceReader implements Iterator> { private final Iterator> recordsIterator; public static HoodieFileSliceReader getFileSliceReader( - Option baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema, Properties props, Option> simpleKeyGenFieldsOpt) throws IOException { + Option baseFileReader, HoodieMergedLogRecordScanner scanner, Schema schema) throws IOException { if (baseFileReader.isPresent()) { Iterator baseIterator = baseFileReader.get().getRecordIterator(schema); while (baseIterator.hasNext()) { - scanner.processNextRecord(baseIterator.next().wrapIntoHoodieRecordPayloadWithParams(schema, props, -simpleKeyGenFieldsOpt, scanner.isWithOperationField(), scanner.getPartitionNameOverride(), false, Option.empty())); +scanner.processNextRecord(baseIterator.next()); Review Comment: I think here we still need to call `wrapIntoHoodieRecordPayloadWithParams` to properly fill record key and partition path values. Could you double check? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
yihua commented on code in PR #9572: URL: https://github.com/apache/hudi/pull/9572#discussion_r1310956732 ## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java: ## @@ -184,16 +185,20 @@ public HoodieRecord copy() { @Override public HoodieRecord wrapIntoHoodieRecordPayloadWithParams( - Schema recordSchema, Properties props, + Schema recordSchema, + Properties props, Option> simpleKeyGenFieldsOpt, Boolean withOperation, Option partitionNameOp, Boolean populateMetaFields, Option schemaWithoutMetaFields) throws IOException { -IndexedRecord indexedRecord = (IndexedRecord) data.getInsertValue(recordSchema, props).get(); +HoodieAvroRecordMerger merger = HoodieAvroRecordMerger.INSTANCE; +HoodieAvroIndexedRecord record = (HoodieAvroIndexedRecord) merger.merge( Review Comment: Let's change the merge API to ``` Option> merge(Option older, Schema oldSchema, Option newer, Schema newSchema, TypedProperties props) ``` so that we pass in `Option.empty` instead of `null`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] CTTY commented on pull request #9221: [HUDI-6550] Add Hadoop conf to HiveConf for HiveSyncConfig
CTTY commented on PR #9221: URL: https://github.com/apache/hudi/pull/9221#issuecomment-1699982042 Hey @xushiyan, I've tested this fix manually with EMR Serverless and it works fine -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-6702) Extend merge API to support all merging operations
[ https://issues.apache.org/jira/browse/HUDI-6702?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-6702: - Labels: pull-request-available (was: ) > Extend merge API to support all merging operations > -- > > Key: HUDI-6702 > URL: https://issues.apache.org/jira/browse/HUDI-6702 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Sagar Sumit >Assignee: Lin Liu >Priority: Blocker > Labels: pull-request-available > Fix For: 1.0.0 > > > See this issue for more details- [https://github.com/apache/hudi/issues/9430] > We may have to introduce a new API or figure out a way for the current merger > to skip empty records. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] linliu-code commented on a diff in pull request #9572: [WIP][HUDI-6702]Utilize merger to replace insertValue api
linliu-code commented on code in PR #9572: URL: https://github.com/apache/hudi/pull/9572#discussion_r1310932741 ## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroRecord.java: ## @@ -184,16 +185,20 @@ public HoodieRecord copy() { @Override public HoodieRecord wrapIntoHoodieRecordPayloadWithParams( - Schema recordSchema, Properties props, + Schema recordSchema, + Properties props, Option> simpleKeyGenFieldsOpt, Boolean withOperation, Option partitionNameOp, Boolean populateMetaFields, Option schemaWithoutMetaFields) throws IOException { -IndexedRecord indexedRecord = (IndexedRecord) data.getInsertValue(recordSchema, props).get(); +HoodieAvroRecordMerger merger = HoodieAvroRecordMerger.INSTANCE; Review Comment: Hi Danny, what do you mean by "inline"? Do you mean we should pass the merger from the parameters? Meanwhile, how do we deprecate this function? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9564: [HUDI-6712] Add Parquet file metadata loader
hudi-bot commented on PR #9564: URL: https://github.com/apache/hudi/pull/9564#issuecomment-1699927341 ## CI report: * c54656897cae544738e30fe42a0fb684787ad704 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19557) * f6a01d87b32aebfce310375b8925f4802acca686 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19560) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9571: Enabling comprehensive schema evolution in delta streamer code
hudi-bot commented on PR #9571: URL: https://github.com/apache/hudi/pull/9571#issuecomment-1699927529 ## CI report: * 070278982fdd12e8f708ea22cbfc641b41d2cfc7 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19543) * 3af6011d72b294b0995d52be40a6d91e6eff9a1b Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19561) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9538: [HUDI-6738] - Apply object filter before checkpoint batching in GcsEventsHoodieIncrSource
hudi-bot commented on PR #9538: URL: https://github.com/apache/hudi/pull/9538#issuecomment-1699926967 ## CI report: * 1c87979b57e306970bcc95530f45586badcf0a6a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19536) * d8d12bf0d3d2c24b0f03be4faf4c293c70db9ecd Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19559) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9571: Enabling comprehensive schema evolution in delta streamer code
hudi-bot commented on PR #9571: URL: https://github.com/apache/hudi/pull/9571#issuecomment-1699916156 ## CI report: * 070278982fdd12e8f708ea22cbfc641b41d2cfc7 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19543) * 3af6011d72b294b0995d52be40a6d91e6eff9a1b UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #9564: [HUDI-6712] Add Parquet file metadata loader
hudi-bot commented on PR #9564: URL: https://github.com/apache/hudi/pull/9564#issuecomment-1699915931 ## CI report: * c54656897cae544738e30fe42a0fb684787ad704 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19557) * f6a01d87b32aebfce310375b8925f4802acca686 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-6771) Support Bloom Filter in Keyed Lookup Reader
[ https://issues.apache.org/jira/browse/HUDI-6771?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17760639#comment-17760639 ] Lin Liu commented on HUDI-6771: --- Have added the bloom filter support in the lookup reader, and added a unit test. > Support Bloom Filter in Keyed Lookup Reader > --- > > Key: HUDI-6771 > URL: https://issues.apache.org/jira/browse/HUDI-6771 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Lin Liu >Priority: Major > > Support bloom filters for this reader. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] hudi-bot commented on pull request #9538: [HUDI-6738] - Apply object filter before checkpoint batching in GcsEventsHoodieIncrSource
hudi-bot commented on PR #9538: URL: https://github.com/apache/hudi/pull/9538#issuecomment-1699915450 ## CI report: * 1c87979b57e306970bcc95530f45586badcf0a6a Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=19536) * d8d12bf0d3d2c24b0f03be4faf4c293c70db9ecd UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-6766) Fixing mysql debezium data loss
[ https://issues.apache.org/jira/browse/HUDI-6766?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17760635#comment-17760635 ] Sandeep Parwal commented on HUDI-6766: -- PR with the fix: [https://github.com/apache/hudi/pull/9475] > Fixing mysql debezium data loss > --- > > Key: HUDI-6766 > URL: https://issues.apache.org/jira/browse/HUDI-6766 > Project: Apache Hudi > Issue Type: Bug > Components: deltastreamer >Reporter: Danny Chen >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Assigned] (HUDI-6795) Implement generation of record_positions for updates and deletes on write path
[ https://issues.apache.org/jira/browse/HUDI-6795?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo reassigned HUDI-6795: --- Assignee: Ethan Guo > Implement generation of record_positions for updates and deletes on write path > -- > > Key: HUDI-6795 > URL: https://issues.apache.org/jira/browse/HUDI-6795 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-6445] Fixing metrics to use IN-MEMORY type in tests (#9543)
This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 45d8290c80a [HUDI-6445] Fixing metrics to use IN-MEMORY type in tests (#9543) 45d8290c80a is described below commit 45d8290c80a3afd604331a4e67ab83490cf305dc Author: Sivabalan Narayanan AuthorDate: Wed Aug 30 17:39:54 2023 -0400 [HUDI-6445] Fixing metrics to use IN-MEMORY type in tests (#9543) --- .../test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java | 6 ++ .../apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java | 3 ++- .../org/apache/hudi/client/functional/TestHoodieMetadataBase.java | 6 ++ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java index 84e6c2cbabf..94a25c12bd6 100644 --- a/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java +++ b/hudi-client/hudi-java-client/src/test/java/org/apache/hudi/testutils/TestHoodieMetadataBase.java @@ -35,12 +35,12 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.metrics.HoodieMetricsConfig; -import org.apache.hudi.config.metrics.HoodieMetricsGraphiteConfig; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.metadata.HoodieMetadataWriteUtils; import org.apache.hudi.metadata.HoodieTableMetadata; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.JavaHoodieBackedTableMetadataWriter; +import org.apache.hudi.metrics.MetricsReporterType; import org.apache.hudi.table.HoodieJavaTable; import org.apache.hudi.table.HoodieTable; @@ -303,9 
+303,7 @@ public class TestHoodieMetadataBase extends HoodieJavaClientTestHarness { .ignoreSpuriousDeletes(validateMetadataPayloadConsistency) .build()) .withMetricsConfig(HoodieMetricsConfig.newBuilder().on(enableMetrics) -.withExecutorMetrics(enableMetrics).build()) -.withMetricsGraphiteConfig(HoodieMetricsGraphiteConfig.newBuilder() -.usePrefix("unit-test").build()) + .withExecutorMetrics(enableMetrics).withReporterType(MetricsReporterType.INMEMORY.name()).build()) .withRollbackUsingMarkers(useRollbackUsingMarkers) .withProperties(properties); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java index f01547e01a9..15b527a0fe3 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java @@ -33,6 +33,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.data.HoodieJavaRDD; import org.apache.hudi.metrics.DistributedRegistry; +import org.apache.hudi.metrics.MetricsReporterType; import org.apache.hadoop.conf.Configuration; import org.apache.spark.api.java.JavaRDD; @@ -98,7 +99,7 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad protected void initRegistry() { if (metadataWriteConfig.isMetricsOn()) { Registry registry; - if (metadataWriteConfig.isExecutorMetricsEnabled()) { + if (metadataWriteConfig.isExecutorMetricsEnabled() && metadataWriteConfig.getMetricsReporterType() != MetricsReporterType.INMEMORY) { registry = Registry.getRegistry("HoodieMetadata", DistributedRegistry.class.getName()); HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) engineContext; ((DistributedRegistry) 
registry).register(sparkEngineContext.getJavaSparkContext()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java index b9fbeab2582..955ac9c3d03 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/functional/TestHoodieMetadataBase.java @@ -35,12 +35,12 @@ import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.metrics.HoodieMetricsConfig; -impo
[GitHub] [hudi] lokesh-lingarajan-0310 commented on a diff in pull request #9473: [HUDI-6724] - Defaulting previous Instant time to init time to enable full read of initial commit
lokesh-lingarajan-0310 commented on code in PR #9473: URL: https://github.com/apache/hudi/pull/9473#discussion_r1310856956 ## hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/helpers/IncrSourceHelper.java: ## @@ -130,7 +130,7 @@ public static QueryInfo generateQueryInfo(JavaSparkContext jssc, String srcBaseP } }); -String previousInstantTime = beginInstantTime; +String previousInstantTime = DEFAULT_BEGIN_TIMESTAMP; if (!beginInstantTime.equals(DEFAULT_BEGIN_TIMESTAMP)) { Review Comment: No -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated: [HUDI-3727] Add metrics for async indexer (#9559)
This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new a898dfd4152 [HUDI-3727] Add metrics for async indexer (#9559) a898dfd4152 is described below commit a898dfd415202bee85442382502b311626fb65da Author: Sagar Sumit AuthorDate: Thu Aug 31 03:04:01 2023 +0530 [HUDI-3727] Add metrics for async indexer (#9559) --- .../apache/hudi/metadata/HoodieMetadataWriteUtils.java | 1 - .../hudi/table/action/index/RunIndexActionExecutor.java | 16 +++- .../org/apache/hudi/metadata/HoodieMetadataMetrics.java | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java index 2078896987d..e73f6fb7bc3 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataWriteUtils.java @@ -68,7 +68,6 @@ public class HoodieMetadataWriteUtils { // eventually depend on the number of file groups selected for each partition (See estimateFileGroupCount function) private static final long MDT_MAX_HFILE_SIZE_BYTES = 10 * 1024 * 1024 * 1024L; // 10GB - /** * Create a {@code HoodieWriteConfig} to use for the Metadata Table. This is used by async * indexer only. 
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java index 9b91167899c..461c525a1d5 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/index/RunIndexActionExecutor.java @@ -27,6 +27,7 @@ import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.client.transaction.TransactionManager; import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.metrics.Registry; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.table.HoodieTableMetaClient; @@ -35,11 +36,13 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.CleanerUtils; import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIndexException; import org.apache.hudi.exception.HoodieMetadataException; +import org.apache.hudi.metadata.HoodieMetadataMetrics; import org.apache.hudi.metadata.HoodieTableMetadataWriter; import org.apache.hudi.metadata.MetadataPartitionType; import org.apache.hudi.table.HoodieTable; @@ -90,6 +93,8 @@ public class RunIndexActionExecutor extends BaseActionExecutor metrics; + // we use this to update the latest instant in data timeline that has been indexed in metadata table // this needs to be volatile as it can be updated in the 
IndexingCheckTask spawned by this executor // assumption is that only one indexer can execute at a time @@ -100,6 +105,11 @@ public class RunIndexActionExecutor extends BaseActionExecutor table, String instantTime) { super(context, config, table, instantTime); this.txnManager = new TransactionManager(config, table.getMetaClient().getFs()); +if (config.getMetadataConfig().enableMetrics()) { + this.metrics = Option.of(new HoodieMetadataMetrics(Registry.getRegistry("HoodieIndexer"))); +} else { + this.metrics = Option.empty(); +} } @Override @@ -143,7 +153,9 @@ public class RunIndexActionExecutor extends BaseActionExecutor m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer())); // get remaining instants to catchup List instantsToCatchup = getInstantsToCatchup(indexUptoInstant); @@ -167,7 +179,7 @@ public class RunIndexActionExecutor extends BaseActionExecutor entry.getMetadataPartitionPath()).collect(Collectors.toList()).toArray())); + .map(entry -> entry.getMetadataPartitionPath()).collect(Collectors.toList()).toArray())); } } else { String indexUptoInstant = fileIndexPartitionInfo.getIndexUptoInstant(); @@ -275,7 +287,9 @@ public class RunIndexActionExecutor extends BaseActionExecutor m.u
[GitHub] [hudi] yihua merged pull request #9559: [HUDI-3727] Add metrics for async indexer
yihua merged PR #9559: URL: https://github.com/apache/hudi/pull/9559 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Comment Edited] (HUDI-6752) Scope out the work for file group reading and writing with record merging in Spark
[ https://issues.apache.org/jira/browse/HUDI-6752?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17760631#comment-17760631 ] Ethan Guo edited comment on HUDI-6752 at 8/30/23 9:32 PM: -- I've created JIRA tickets in the corresponding EPICs and here's the scope: EPIC HUDI-3217: (15pt P0) Finalize RecordMerger API for use with java, python and other languages. EPIC HUDI-6243: (17pt) Engine agnostic FileGroupReader "internal" API, replaces Spark and Hive reads. EPIC HUDI-6722: (22pt) Positional update, delete, partial update, event_time based merge and custom merger support on read and write paths. EPIC HUDI-6243: (27pt) Spark MoR Snapshot, Incremental, ReadOptimized, CDC, TimeTravel queries on new storage format. was (Author: guoyihua): I've created JIRA tickets in the corresponding EPICs and here's the scope: EPIC HUDI-3217: (15pt P0) Finalize RecordMerger API for use with java, python and other languages. EPIC HUDI-6243: (17pt) Engine agnostic FileGroupReader "internal" API, replaces Spark and Hive reads. EPIC HUDI-6722: (22pt) Positional update, delete, partial update, event_time based merge and custom merger support on read and write paths. EPIC HUDI-6243: (27pt) Spark MoR Snapshot, Incremental, ReadOptimized, CDC, TimeTravel queries on new storage format. > Scope out the work for file group reading and writing with record merging in > Spark > -- > > Key: HUDI-6752 > URL: https://issues.apache.org/jira/browse/HUDI-6752 > Project: Apache Hudi > Issue Type: Task >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] yihua commented on pull request #9559: [HUDI-3727] Add metrics for async indexer
yihua commented on PR #9559: URL: https://github.com/apache/hudi/pull/9559#issuecomment-1699873087 CI times out on the fourth job, which looks irrelevant. Merging this PR. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (HUDI-6752) Scope out the work for file group reading and writing with record merging in Spark
[ https://issues.apache.org/jira/browse/HUDI-6752?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17760631#comment-17760631 ] Ethan Guo commented on HUDI-6752: - I've created JIRA tickets in the corresponding EPICs and here's the scope: EPIC HUDI-3217: (15pt P0) Finalize RecordMerger API for use with java, python and other languages. EPIC HUDI-6243: (17pt) Engine agnostic FileGroupReader "internal" API, replaces Spark and Hive reads. EPIC HUDI-6722: (22pt) Positional update, delete, partial update, event_time based merge and custom merger support on read and write paths. EPIC HUDI-6243: (27pt) Spark MoR Snapshot, Incremental, ReadOptimized, CDC, TimeTravel queries on new storage format. > Scope out the work for file group reading and writing with record merging in > Spark > -- > > Key: HUDI-6752 > URL: https://issues.apache.org/jira/browse/HUDI-6752 > Project: Apache Hudi > Issue Type: Task >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6793) Support time-travel read in engine-agnostic FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6793?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6793: Priority: Blocker (was: Major) > Support time-travel read in engine-agnostic FileGroupReader > --- > > Key: HUDI-6793 > URL: https://issues.apache.org/jira/browse/HUDI-6793 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6799) Integrate FileGroupReader with merge handle on the write path
[ https://issues.apache.org/jira/browse/HUDI-6799?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6799: Priority: Blocker (was: Major) > Integrate FileGroupReader with merge handle on the write path > -- > > Key: HUDI-6799 > URL: https://issues.apache.org/jira/browse/HUDI-6799 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6790) Support incremental read in engine-agnostic FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6790?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6790: Priority: Blocker (was: Major) > Support incremental read in engine-agnostic FileGroupReader > --- > > Key: HUDI-6790 > URL: https://issues.apache.org/jira/browse/HUDI-6790 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6794) Support completion-time-based file slice in FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6794?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6794: Priority: Blocker (was: Major) > Support completion-time-based file slice in FileGroupReader > --- > > Key: HUDI-6794 > URL: https://issues.apache.org/jira/browse/HUDI-6794 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6788) Integrate FileGroupReader with MergeOnReadInputFormat for Flink
[ https://issues.apache.org/jira/browse/HUDI-6788?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6788: Priority: Blocker (was: Major) > Integrate FileGroupReader with MergeOnReadInputFormat for Flink > --- > > Key: HUDI-6788 > URL: https://issues.apache.org/jira/browse/HUDI-6788 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6800) Implement log writing with partial updates on the write path
[ https://issues.apache.org/jira/browse/HUDI-6800?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6800: Priority: Blocker (was: Major) > Implement log writing with partial updates on the write path > > > Key: HUDI-6800 > URL: https://issues.apache.org/jira/browse/HUDI-6800 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6787) Integrate FileGroupReader with HoodieMergeOnReadSnapshotReader and RealtimeCompactedRecordReader for Hive
[ https://issues.apache.org/jira/browse/HUDI-6787?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6787: Priority: Blocker (was: Major) > Integrate FileGroupReader with HoodieMergeOnReadSnapshotReader and > RealtimeCompactedRecordReader for Hive > - > > Key: HUDI-6787 > URL: https://issues.apache.org/jira/browse/HUDI-6787 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6791) Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query
[ https://issues.apache.org/jira/browse/HUDI-6791?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6791: Priority: Blocker (was: Major) > Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query > - > > Key: HUDI-6791 > URL: https://issues.apache.org/jira/browse/HUDI-6791 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6792) Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark Incremental Query
[ https://issues.apache.org/jira/browse/HUDI-6792?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6792: Priority: Blocker (was: Major) > Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark > Incremental Query > - > > Key: HUDI-6792 > URL: https://issues.apache.org/jira/browse/HUDI-6792 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6802) Use completion time in Spark FileIndex for listing
[ https://issues.apache.org/jira/browse/HUDI-6802?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6802: Priority: Blocker (was: Major) > Use completion time in Spark FileIndex for listing > -- > > Key: HUDI-6802 > URL: https://issues.apache.org/jira/browse/HUDI-6802 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6801) Implement merging of partial updates in FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6801?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6801: Priority: Blocker (was: Major) > Implement merging of partial updates in FileGroupReader > --- > > Key: HUDI-6801 > URL: https://issues.apache.org/jira/browse/HUDI-6801 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6789) Support CDC read in engine-agnostic FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6789?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6789: Priority: Blocker (was: Major) > Support CDC read in engine-agnostic FileGroupReader > --- > > Key: HUDI-6789 > URL: https://issues.apache.org/jira/browse/HUDI-6789 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6797) Implement position-based updates in FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6797?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6797: Priority: Blocker (was: Major) > Implement position-based updates in FileGroupReader > --- > > Key: HUDI-6797 > URL: https://issues.apache.org/jira/browse/HUDI-6797 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6795) Implement generation of record_positions for updates and deletes on write path
[ https://issues.apache.org/jira/browse/HUDI-6795?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6795: Priority: Blocker (was: Major) > Implement generation of record_positions for updates and deletes on write path > -- > > Key: HUDI-6795 > URL: https://issues.apache.org/jira/browse/HUDI-6795 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6796) Implement position-based deletes in FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6796?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6796: Priority: Blocker (was: Major) > Implement position-based deletes in FileGroupReader > --- > > Key: HUDI-6796 > URL: https://issues.apache.org/jira/browse/HUDI-6796 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6795) Implement generation of record_positions for updates and deletes on write path
[ https://issues.apache.org/jira/browse/HUDI-6795?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6795: Status: In Progress (was: Open) > Implement generation of record_positions for updates and deletes on write path > -- > > Key: HUDI-6795 > URL: https://issues.apache.org/jira/browse/HUDI-6795 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6798) Implement event-time-based merging mode in FileGroupReader
[ https://issues.apache.org/jira/browse/HUDI-6798?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6798: Priority: Blocker (was: Major) > Implement event-time-based merging mode in FileGroupReader > -- > > Key: HUDI-6798 > URL: https://issues.apache.org/jira/browse/HUDI-6798 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6791) Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query
[ https://issues.apache.org/jira/browse/HUDI-6791?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6791: Story Points: 4 (was: 3) > Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query > - > > Key: HUDI-6791 > URL: https://issues.apache.org/jira/browse/HUDI-6791 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6799) Integrate FileGroupReader with merge handle on the write path
[ https://issues.apache.org/jira/browse/HUDI-6799?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6799: Fix Version/s: 1.0.0 > Integrate FileGroupReader with merge handle on the write path > -- > > Key: HUDI-6799 > URL: https://issues.apache.org/jira/browse/HUDI-6799 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6785) Introduce an engine-agnostic FileGroupReader for snapshot read
[ https://issues.apache.org/jira/browse/HUDI-6785?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6785: Fix Version/s: 1.0.0 > Introduce an engine-agnostic FileGroupReader for snapshot read > -- > > Key: HUDI-6785 > URL: https://issues.apache.org/jira/browse/HUDI-6785 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Blocker > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-6791) Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query
[ https://issues.apache.org/jira/browse/HUDI-6791?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-6791: Story Points: 3 > Integrate FileGroupReader with NewHoodieParquetFileFormat for Spark CDC Query > - > > Key: HUDI-6791 > URL: https://issues.apache.org/jira/browse/HUDI-6791 > Project: Apache Hudi > Issue Type: New Feature >Reporter: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Closed] (HUDI-6752) Scope out the work for file group reading and writing with record merging in Spark
[ https://issues.apache.org/jira/browse/HUDI-6752?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo closed HUDI-6752. --- Resolution: Fixed > Scope out the work for file group reading and writing with record merging in > Spark > -- > > Key: HUDI-6752 > URL: https://issues.apache.org/jira/browse/HUDI-6752 > Project: Apache Hudi > Issue Type: Task >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 1.0.0 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] nsivabalan closed pull request #9533: [HUDI-6445] Fixing metrics in tests
nsivabalan closed pull request #9533: [HUDI-6445] Fixing metrics in tests URL: https://github.com/apache/hudi/pull/9533 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Assigned] (HUDI-6807) MoR Incremental count queries trigger full scan of files in table
[ https://issues.apache.org/jira/browse/HUDI-6807?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] sivabalan narayanan reassigned HUDI-6807: - Assignee: sivabalan narayanan > MoR Incremental count queries trigger full scan of files in table > - > > Key: HUDI-6807 > URL: https://issues.apache.org/jira/browse/HUDI-6807 > Project: Apache Hudi > Issue Type: Bug > Reporter: Timothy Brown > Assignee: sivabalan narayanan > Priority: Major > > While running the `TestMORDataSource` datasource tests, I saw that we > eventually call `HoodiePruneFileSourcePartitions`, which lists all of the > files in the table instead of only the files that are relevant to the > incremental query. Ideally, the listing would be limited to the files that > were impacted by commits within the specified range. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Created] (HUDI-6807) MoR Incremental count queries trigger full scan of files in table
Timothy Brown created HUDI-6807: --- Summary: MoR Incremental count queries trigger full scan of files in table Key: HUDI-6807 URL: https://issues.apache.org/jira/browse/HUDI-6807 Project: Apache Hudi Issue Type: Bug Reporter: Timothy Brown While running the `TestMORDataSource` datasource tests, I saw that we eventually call `HoodiePruneFileSourcePartitions`, which lists all of the files in the table instead of only the files that are relevant to the incremental query. Ideally, the listing would be limited to the files that were impacted by commits within the specified range. -- This message was sent by Atlassian Jira (v8.20.10#820010)