[jira] [Commented] (HUDI-4813) Infer keygen not work in sparksql side
[ https://issues.apache.org/jira/browse/HUDI-4813?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17605625#comment-17605625 ] Danny Chen commented on HUDI-4813: -- Fixed via master branch: 3faddb7da09e5e11d1b126ba49cea4ebdeba8fc7 > Infer keygen not work in sparksql side > -- > > Key: HUDI-4813 > URL: https://issues.apache.org/jira/browse/HUDI-4813 > Project: Apache Hudi > Issue Type: Bug >Reporter: JinxinTang >Priority: Major > Labels: pull-request-available > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Resolved] (HUDI-4813) Infer keygen not work in sparksql side
[ https://issues.apache.org/jira/browse/HUDI-4813?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Danny Chen resolved HUDI-4813. -- > Infer keygen not work in sparksql side > -- > > Key: HUDI-4813 > URL: https://issues.apache.org/jira/browse/HUDI-4813 > Project: Apache Hudi > Issue Type: Bug >Reporter: JinxinTang >Priority: Major > Labels: pull-request-available > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4813) Infer keygen not work in sparksql side
[ https://issues.apache.org/jira/browse/HUDI-4813?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Danny Chen updated HUDI-4813: - Fix Version/s: 0.12.1 > Infer keygen not work in sparksql side > -- > > Key: HUDI-4813 > URL: https://issues.apache.org/jira/browse/HUDI-4813 > Project: Apache Hudi > Issue Type: Bug >Reporter: JinxinTang >Priority: Major > Labels: pull-request-available > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-4813] Fix infer keygen not work in sparksql side issue (#6634)
This is an automated email from the ASF dual-hosted git repository. danny0405 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 3faddb7da0 [HUDI-4813] Fix infer keygen not work in sparksql side issue (#6634) 3faddb7da0 is described below commit 3faddb7da09e5e11d1b126ba49cea4ebdeba8fc7 Author: FocusComputing AuthorDate: Fri Sep 16 13:58:05 2022 +0800 [HUDI-4813] Fix infer keygen not work in sparksql side issue (#6634) * [HUDI-4813] Fix infer keygen not work in sparksql side issue Co-authored-by: xiaoxingstack --- .../scala/org/apache/hudi/DataSourceOptions.scala | 10 ++- .../sql/catalyst/catalog/HoodieCatalogTable.scala | 9 ++- .../org/apache/hudi/TestHoodieSparkSqlWriter.scala | 2 +- .../apache/spark/sql/hudi/TestCreateTable.scala| 74 +- 4 files changed, 86 insertions(+), 9 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index c694174b8c..e8ffb09ff9 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -23,7 +23,7 @@ import org.apache.hudi.common.config.{ConfigProperty, DFSPropertiesConfiguration import org.apache.hudi.common.fs.ConsistencyGuardConfig import org.apache.hudi.common.model.{HoodieTableType, WriteOperationType} import org.apache.hudi.common.table.HoodieTableConfig -import org.apache.hudi.common.util.Option +import org.apache.hudi.common.util.{Option, StringUtils} import org.apache.hudi.common.util.ValidationUtils.checkState import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig} import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncConfigHolder, HiveSyncTool} @@ -787,9 +787,13 @@ object 
DataSourceOptionsHelper { def inferKeyGenClazz(props: TypedProperties): String = { val partitionFields = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), null) -if (partitionFields != null) { +val recordsKeyFields = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD.key(), DataSourceWriteOptions.RECORDKEY_FIELD.defaultValue()) +inferKeyGenClazz(recordsKeyFields, partitionFields) + } + + def inferKeyGenClazz(recordsKeyFields: String, partitionFields: String): String = { +if (!StringUtils.isNullOrEmpty(partitionFields)) { val numPartFields = partitionFields.split(",").length - val recordsKeyFields = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD.key(), DataSourceWriteOptions.RECORDKEY_FIELD.defaultValue()) val numRecordKeyFields = recordsKeyFields.split(",").length if (numPartFields == 1 && numRecordKeyFields == 1) { classOf[SimpleKeyGenerator].getName diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala index 09981e845a..f135772320 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/catalyst/catalog/HoodieCatalogTable.scala @@ -17,19 +17,19 @@ package org.apache.spark.sql.catalyst.catalog -import org.apache.hudi.AvroConversionUtils import org.apache.hudi.DataSourceWriteOptions.OPERATION import org.apache.hudi.HoodieWriterUtils._ import org.apache.hudi.common.config.DFSPropertiesConfiguration import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient} import org.apache.hudi.common.util.{StringUtils, ValidationUtils} -import org.apache.hudi.keygen.ComplexKeyGenerator import 
org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory +import org.apache.hudi.{AvroConversionUtils, DataSourceOptionsHelper} import org.apache.spark.internal.Logging import org.apache.spark.sql.avro.SchemaConverters import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.hudi.HoodieOptionConfig +import org.apache.spark.sql.hudi.HoodieOptionConfig.SQL_KEY_TABLE_PRIMARY_KEY import org.apache.spark.sql.hudi.HoodieSqlCommonUtils._ import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.{AnalysisException, SparkSession} @@ -288,7 +288,10 @@ class HoodieCatalogTable(val spark: SparkSession, var table: CatalogTable) exten HoodieSparkKeyGeneratorFactory.convertToSparkKeyGenerator( originTableConfig(HoodieTableConfig.KEY_GENERATOR_CLASS_NAME.key)) } else { -
[GitHub] [hudi] danny0405 merged pull request #6634: [HUDI-4813] Fix infer keygen not work in sparksql side issue
danny0405 merged PR #6634: URL: https://github.com/apache/hudi/pull/6634 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Resolved] (HUDI-4853) Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch
[ https://issues.apache.org/jira/browse/HUDI-4853?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Danny Chen resolved HUDI-4853. -- > Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid > schema mismatch > > > Key: HUDI-4853 > URL: https://issues.apache.org/jira/browse/HUDI-4853 > Project: Apache Hudi > Issue Type: Bug > Components: core >Affects Versions: 0.12.0 >Reporter: Danny Chen >Priority: Major > Labels: pull-request-available > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Commented] (HUDI-4853) Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch
[ https://issues.apache.org/jira/browse/HUDI-4853?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=17605624#comment-17605624 ] Danny Chen commented on HUDI-4853: -- Fixed via master branch: f70678f4354c6264b6a1e38900dd7a11cb345b96 > Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid > schema mismatch > > > Key: HUDI-4853 > URL: https://issues.apache.org/jira/browse/HUDI-4853 > Project: Apache Hudi > Issue Type: Bug > Components: core >Affects Versions: 0.12.0 >Reporter: Danny Chen >Priority: Major > Labels: pull-request-available > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch (#6689)
This is an automated email from the ASF dual-hosted git repository. danny0405 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new f70678f435 [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch (#6689) f70678f435 is described below commit f70678f4354c6264b6a1e38900dd7a11cb345b96 Author: Danny Chan AuthorDate: Fri Sep 16 13:56:23 2022 +0800 [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch (#6689) --- .../OverwriteNonDefaultsWithLatestAvroPayload.java | 2 +- ...tOverwriteNonDefaultsWithLatestAvroPayload.java | 59 -- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java index 6ce99aae21..9ce241bc78 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java @@ -70,7 +70,7 @@ public class OverwriteNonDefaultsWithLatestAvroPayload extends OverwriteWithLate if (!overwriteField(value, defaultValue)) { builder.set(field, value); } else { - builder.set(field, currentRecord.get(field.pos())); + builder.set(field, currentRecord.get(field.name())); } }); return Option.of(builder.build()); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java index 9e3405b304..0807b41f61 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java +++ 
b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteNonDefaultsWithLatestAvroPayload.java @@ -23,6 +23,7 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.avro.HoodieAvroUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -54,13 +55,15 @@ public class TestOverwriteNonDefaultsWithLatestAvroPayload { @Test public void testActiveRecords() throws IOException { +Schema writerSchema = HoodieAvroUtils.addMetadataFields(schema); + GenericRecord record1 = new GenericData.Record(schema); record1.put("id", "1"); record1.put("partition", "partition1"); record1.put("ts", 0L); record1.put("_hoodie_is_deleted", false); record1.put("city", "NY0"); -record1.put("child", Arrays.asList("A")); +record1.put("child", Collections.singletonList("A")); GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "2"); @@ -76,11 +79,38 @@ public class TestOverwriteNonDefaultsWithLatestAvroPayload { record3.put("ts", 1L); record3.put("_hoodie_is_deleted", false); record3.put("city", "NY0"); -record3.put("child", Arrays.asList("A")); - +record3.put("child", Collections.singletonList("A")); + +// same content with record1 plus metadata fields +GenericRecord record4 = createRecordWithMetadataFields(writerSchema, "1", "partition1"); +record4.put("id", "1"); +record4.put("partition", "partition1"); +record4.put("ts", 0L); +record4.put("_hoodie_is_deleted", false); +record4.put("city", "NY0"); +record4.put("child", Collections.singletonList("A")); + +// same content with record2 plus metadata fields +GenericRecord record5 = createRecordWithMetadataFields(writerSchema, "2", ""); +record5.put("id", "2"); +record5.put("partition", ""); +record5.put("ts", 1L); +record5.put("_hoodie_is_deleted", false); +record5.put("city", "NY"); +record5.put("child", Collections.emptyList()); + +// same content 
with record3 plus metadata fields +GenericRecord record6 = createRecordWithMetadataFields(writerSchema, "2", ""); +record6.put("id", "2"); +record6.put("partition", "partition1"); +record6.put("ts", 1L); +record6.put("_hoodie_is_deleted", false); +record6.put("city", "NY0"); +record6.put("child", Collections.singletonList("A")); OverwriteNonDefaultsWithLatestAvroPayload payload1 = new OverwriteNonDefaultsWithLatestAvroPayload(record1, 1); OverwriteNonDefaultsWithLatestAvroPayload payload2 = new OverwriteNonDefaultsWithLatestAvroPayload(record2, 2); +OverwriteNonDefaultsWithLatestAvroPayload payload5 = new OverwriteNonDefaultsWithLatestAvroPayload(record5, 2);
[GitHub] [hudi] danny0405 merged pull request #6689: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroP…
danny0405 merged PR #6689: URL: https://github.com/apache/hudi/pull/6689 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4760) Clustering results in repeated triggers of clustering execution
[ https://issues.apache.org/jira/browse/HUDI-4760?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu updated HUDI-4760: - Priority: Blocker (was: Major) > Clustering results in repeated triggers of clustering execution > --- > > Key: HUDI-4760 > URL: https://issues.apache.org/jira/browse/HUDI-4760 > Project: Apache Hudi > Issue Type: Bug > Components: clustering >Reporter: sivabalan narayanan >Assignee: sivabalan narayanan >Priority: Blocker > Labels: pull-request-available > Fix For: 0.12.1 > > > Looks like clustering is getting executed twice, atleast data files and > marker files are created twice (only diff is the write token) and later gets > reconciled w/ markers (finalize step). > > Reported by [https://github.com/apache/hudi/issues/6212] > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4724) add function of skip the _rt suffix for read snapshot
[ https://issues.apache.org/jira/browse/HUDI-4724?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu updated HUDI-4724: - Status: Patch Available (was: In Progress) > add function of skip the _rt suffix for read snapshot > - > > Key: HUDI-4724 > URL: https://issues.apache.org/jira/browse/HUDI-4724 > Project: Apache Hudi > Issue Type: Improvement >Reporter: linfey.nie >Priority: Major > Labels: pull-request-available > > During Hive query, we usually use the original table name to write SQL. > Therefore, we need to skip the _rt suffix for read snapshot, the latest data > for calculation. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4724) add function of skip the _rt suffix for read snapshot
[ https://issues.apache.org/jira/browse/HUDI-4724?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu updated HUDI-4724: - Status: In Progress (was: Open) > add function of skip the _rt suffix for read snapshot > - > > Key: HUDI-4724 > URL: https://issues.apache.org/jira/browse/HUDI-4724 > Project: Apache Hudi > Issue Type: Improvement >Reporter: linfey.nie >Priority: Major > Labels: pull-request-available > > During Hive query, we usually use the original table name to write SQL. > Therefore, we need to skip the _rt suffix for read snapshot, the latest data > for calculation. -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] xushiyan commented on a diff in pull request #6537: [HUDI-4762] Avoid update metastore schema if only missing column in input
xushiyan commented on code in PR #6537: URL: https://github.com/apache/hudi/pull/6537#discussion_r972634507 ## hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java: ## @@ -286,7 +286,11 @@ private boolean syncSchema(String tableName, boolean tableExists, boolean useRea config.getBooleanOrDefault(HIVE_SUPPORT_TIMESTAMP_TYPE)); if (!schemaDiff.isEmpty()) { LOG.info("Schema difference found for " + tableName); -syncClient.updateTableSchema(tableName, schema); +if (!schemaDiff.getAddColumnTypes().isEmpty() || !schemaDiff.getUpdateColumnTypes().isEmpty()) { Review Comment: we should always keep schema up to date. when later data written with old schema, https://hudi.apache.org/docs/configurations#hoodiedatasourcewritereconcileschema this config is to adapt the data into the new schema. so i don't think we should skip update schema -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for metadata-only bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Status: In Progress (was: Open) > Deltastreamer does not respect partition selector regex for metadata-only > bootstrap > --- > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] prasannarajaperumal commented on a diff in pull request #6476: [HUDI-3478] Support CDC for Spark in Hudi
prasannarajaperumal commented on code in PR #6476: URL: https://github.com/apache/hudi/pull/6476#discussion_r972627223 ## hudi-common/src/main/java/org/apache/hudi/avro/SerializableRecord.java: ## @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.avro.generic.GenericData; + +import java.io.Serializable; + +/** + * In some cases like putting the [[GenericData.Record]] into [[ExternalSpillableMap]], + * objects is asked to extend [[Serializable]]. + * + * This class wraps [[GenericData.Record]]. + */ +public class SerializableRecord implements Serializable { Review Comment: The serialization of external map is done by Kryo here. Is there a Serializer registered for Avro Schema with kyro?. I dont see the test for this with ExternalSpillableMap anymore - @YannByron - Can you check? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] prasannarajaperumal commented on a diff in pull request #6476: [HUDI-3478] Support CDC for Spark in Hudi
prasannarajaperumal commented on code in PR #6476: URL: https://github.com/apache/hudi/pull/6476#discussion_r972627223 ## hudi-common/src/main/java/org/apache/hudi/avro/SerializableRecord.java: ## @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.avro.generic.GenericData; + +import java.io.Serializable; + +/** + * In some cases like putting the [[GenericData.Record]] into [[ExternalSpillableMap]], + * objects is asked to extend [[Serializable]]. + * + * This class wraps [[GenericData.Record]]. + */ +public class SerializableRecord implements Serializable { Review Comment: The serialization of external map is done by Kryo here. Is there a Serializer registered for Avro Schema with kyro?. I dont see the test for this in ExternalSpillableMap anymore - @YannByron - Can you check? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] prasannarajaperumal commented on a diff in pull request #6476: [HUDI-3478] Support CDC for Spark in Hudi
prasannarajaperumal commented on code in PR #6476: URL: https://github.com/apache/hudi/pull/6476#discussion_r972627223 ## hudi-common/src/main/java/org/apache/hudi/avro/SerializableRecord.java: ## @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.avro; + +import org.apache.avro.generic.GenericData; + +import java.io.Serializable; + +/** + * In some cases like putting the [[GenericData.Record]] into [[ExternalSpillableMap]], + * objects is asked to extend [[Serializable]]. + * + * This class wraps [[GenericData.Record]]. + */ +public class SerializableRecord implements Serializable { Review Comment: The serialization of external map is done by Kryo here. Is there a Serializer registered for Avro Schema with kyro? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] Zhangshunyu commented on issue #6691: [SUPPORT]Error after applyed HUDI-4851 for InSet
Zhangshunyu commented on issue #6691: URL: https://github.com/apache/hudi/issues/6691#issuecomment-1248933132 select count(*),count(distinct word) from test_table where word in ('HelloWorld', 'OK', ... etc. 1000 words here) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] Zhangshunyu opened a new issue, #6691: [SUPPORT]Error while applyed HUDI-4851 for InSet
Zhangshunyu opened a new issue, #6691: URL: https://github.com/apache/hudi/issues/6691 ``` Caused by: java.lang.RuntimeException: Unsupported literal type class org.apache.spark.unsafe.types.UTF8String HelloWorld at org.apache.spark.sql.catalyst.expressions.Literal$.apply(literals.scala:95) at org.apache.spark.sql.hudi.DataSkippingUtils$.$anonfun$tryComposeIndexFilterExpr$35(DataSkippingUtils.scala) ``` after apply HUDI-4851 for InSet, we query the table thorws this error @alexeykudinkin could you pls have look at this issue? thank you! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch asf-site updated: fix: blog image landing page (#6690)
This is an automated email from the ASF dual-hosted git repository. bhavanisudha pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/asf-site by this push: new 3b655aa086 fix: blog image landing page (#6690) 3b655aa086 is described below commit 3b655aa0861b93de188a88f58b104c503cab3964 Author: pintusoliya <37680791+pintusol...@users.noreply.github.com> AuthorDate: Fri Sep 16 10:44:38 2022 +0530 fix: blog image landing page (#6690) --- website/src/css/custom.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/css/custom.css b/website/src/css/custom.css index 87b508c719..fdae17d255 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -219,7 +219,7 @@ footer .container { left: 0; height:100%; width: 100%; - object-fit: cover; + object-fit: contain; } .tagRegular_node_modules-\@docusaurus-theme-classic-lib-next-theme-Tag-styles-module{
[GitHub] [hudi] bhasudha merged pull request #6690: [DOCS] fix: blog image landing page
bhasudha merged PR #6690: URL: https://github.com/apache/hudi/pull/6690 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4762) Hive sync update schema removes columns
[ https://issues.apache.org/jira/browse/HUDI-4762?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu updated HUDI-4762: - Status: In Progress (was: Open) > Hive sync update schema removes columns > > > Key: HUDI-4762 > URL: https://issues.apache.org/jira/browse/HUDI-4762 > Project: Apache Hudi > Issue Type: Bug > Components: meta-sync >Reporter: nicolas paris >Assignee: nicolas paris >Priority: Critical > Labels: pull-request-available > Fix For: 0.12.1 > > > Currently when move a hudi table from schema1 to schema2 and then insert data > with the old schema1, then schema 2 is kept for the whole table. > This is not consistent with hive metastore which get its schema updated to > the old schema1. > Avoid update metastore schema if only missing column in input -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4762) Hive sync update schema removes columns
[ https://issues.apache.org/jira/browse/HUDI-4762?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu updated HUDI-4762: - Status: Patch Available (was: In Progress) > Hive sync update schema removes columns > > > Key: HUDI-4762 > URL: https://issues.apache.org/jira/browse/HUDI-4762 > Project: Apache Hudi > Issue Type: Bug > Components: meta-sync >Reporter: nicolas paris >Assignee: nicolas paris >Priority: Critical > Labels: pull-request-available > Fix For: 0.12.1 > > > Currently when move a hudi table from schema1 to schema2 and then insert data > with the old schema1, then schema 2 is kept for the whole table. > This is not consistent with hive metastore which get its schema updated to > the old schema1. > Avoid update metastore schema if only missing column in input -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Closed] (HUDI-3861) 'path' in CatalogTable#properties failed to be updated when renaming table
[ https://issues.apache.org/jira/browse/HUDI-3861?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Raymond Xu closed HUDI-3861. Resolution: Fixed > 'path' in CatalogTable#properties failed to be updated when renaming table > -- > > Key: HUDI-3861 > URL: https://issues.apache.org/jira/browse/HUDI-3861 > Project: Apache Hudi > Issue Type: Bug >Reporter: Jin Xing >Assignee: KnightChess >Priority: Critical > Labels: pull-request-available > Fix For: 0.12.1 > > > Reproduce the issue as below > {code:java} > 1. Create a MOR table > create table mor_simple( > id int, > name string, > price double > ) > using hudi > options ( > type = 'cow', > primaryKey = 'id' > ) > 2. Renaming > alter table mor_simple rename to mor_simple0 > 3. Show create table mor_simple0 > Output as > CREATE TABLE hudi.mor_simple0 ( > `_hoodie_commit_time` STRING, > `_hoodie_commit_seqno` STRING, > `_hoodie_record_key` STRING, > `_hoodie_partition_path` STRING, > `_hoodie_file_name` STRING, > `id` INT, > `name` STRING, > `price` DOUBLE) > USING hudi > OPTIONS( > 'primaryKey' = 'id', > 'type' = 'cow') > TBLPROPERTIES( > 'path' = '/user/hive/warehous/hudi.db/mor_simple'){code} > As we can see, the 'path' property is > '/user/hive/warehous/hudi.db/mor_simple', rather than > '/user/hive/warehous/hudi.db/mor_simple0'. > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated (bf64e60d31 -> c2b72306bd)
This is an automated email from the ASF dual-hosted git repository. xushiyan pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from bf64e60d31 [HUDI-4796] MetricsReporter stop bug (#6619) add c2b72306bd [HUDI-3861] update tblp 'path' when rename table (#5320) No new revisions were added by this update. Summary of changes: .../command/AlterHoodieTableRenameCommand.scala| 11 ++- .../org/apache/spark/sql/hudi/TestAlterTable.scala | 78 +- 2 files changed, 84 insertions(+), 5 deletions(-)
[GitHub] [hudi] xushiyan merged pull request #5320: [HUDI-3861] update tblp 'path' when rename table
xushiyan merged PR #5320: URL: https://github.com/apache/hudi/pull/5320 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] pintusoliya opened a new pull request, #6690: fix: blog image landing page
pintusoliya opened a new pull request, #6690: URL: https://github.com/apache/hudi/pull/6690 ### Change Logs _Describe context and summary for this change. Highlight if any code was copied._ ### Impact _Describe any public API or user-facing feature change or any performance impact._ **Risk level: none | low | medium | high** _Choose one. If medium or high, explain what verification was done to mitigate the risks._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6689: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroP…
hudi-bot commented on PR #6689: URL: https://github.com/apache/hudi/pull/6689#issuecomment-1248921800 ## CI report: * cab4b6a3b31aff9a0aa4a825d341346aaa7ede73 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11399) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6358: [HUDI-4588][HUDI-4472] Fixing `HoodieParquetReader` to properly specify projected schema when reading Parquet file
hudi-bot commented on PR #6358: URL: https://github.com/apache/hudi/pull/6358#issuecomment-1248921478 ## CI report: * 288d166c49602a4593b1e97763a467811903737d UNKNOWN * c4b6bb8dc7a4ddce5f729e5a49ac10aad25e8931 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11394) * 2f9e8ca8d6893e973883dadcab117597ee6badd3 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11405) * 0f8adc10d644da86c1b070d4cdecfb28cd016e79 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11411) * a0ebc2afda58895b220ef520d58cf2187a4edd1b UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #3985: [HUDI-2754] Performance improvement for IncrementalRelation
hudi-bot commented on PR #3985: URL: https://github.com/apache/hudi/pull/3985#issuecomment-1248920688 ## CI report: * ccd1d89352a2f72feb381962718cc0c80920c041 UNKNOWN * dee3b8154dfc173ea70352986a7ebdd028a968b0 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=6003) * 6eaade5cab40d7eed0d67f06c820628c97b47528 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11410) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6676: [HUDI-4453] Fix schema to include partition columns in bootstrap operation
hudi-bot commented on PR #6676: URL: https://github.com/apache/hudi/pull/6676#issuecomment-1248919461 ## CI report: * fa203bff2e2bb9fc27e50f0b0c2613770bfa5dc6 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11400) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] codope commented on a diff in pull request #6548: [HUDI-4749] Fixing full cleaning to leverage metadata table
codope commented on code in PR #6548: URL: https://github.com/apache/hudi/pull/6548#discussion_r972613010 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java: ## @@ -206,15 +206,7 @@ private List getPartitionPathsForIncrementalCleaning(HoodieCleanMetadata */ private List getPartitionPathsForFullCleaning() { // Go to brute force mode of scanning all partitions -try { - // Because the partition of BaseTableMetadata has been deleted, - // all partition information can only be obtained from FileSystemBackedTableMetadata. Review Comment: @nsivabalan I think your suggestion makes sense given that we now do lazy cleaning. If `files` partition of metadata table exists then, most likely, the record is there with `isDeleted` set to true. The metadata based fs view should understand the [isDeleted](https://github.com/apache/hudi/blob/master/hudi-common/src/main/avro/HoodieMetadata.avsc#L50) flag of `HoodieMetadataFileInfo`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6358: [HUDI-4588][HUDI-4472] Fixing `HoodieParquetReader` to properly specify projected schema when reading Parquet file
hudi-bot commented on PR #6358: URL: https://github.com/apache/hudi/pull/6358#issuecomment-1248919216 ## CI report: * 288d166c49602a4593b1e97763a467811903737d UNKNOWN * c4b6bb8dc7a4ddce5f729e5a49ac10aad25e8931 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11394) * 2f9e8ca8d6893e973883dadcab117597ee6badd3 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11405) * 0f8adc10d644da86c1b070d4cdecfb28cd016e79 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #3985: [HUDI-2754] Performance improvement for IncrementalRelation
hudi-bot commented on PR #3985: URL: https://github.com/apache/hudi/pull/3985#issuecomment-1248918482 ## CI report: * ccd1d89352a2f72feb381962718cc0c80920c041 UNKNOWN * dee3b8154dfc173ea70352986a7ebdd028a968b0 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=6003) * 6eaade5cab40d7eed0d67f06c820628c97b47528 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6688: Fix AWSDmsAvroPayload#combineAndGetUpdateValue when using MOR snapshot query after delete operations
hudi-bot commented on PR #6688: URL: https://github.com/apache/hudi/pull/6688#issuecomment-1248917319 ## CI report: * fff1405467fb5f6a7fdb6d3d043714e268f1c875 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11398) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6670: [HUDI-4842] Support compression strategy based on delta file length
hudi-bot commented on PR #6670: URL: https://github.com/apache/hudi/pull/6670#issuecomment-1248891059 ## CI report: * 462f77736f855dc277cc62e0778fb4c1fa04f09a Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11357) * 9d0a67996db91ad90808ddb83aa281ad04e91a76 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11409) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6670: [HUDI-4842] Support compression strategy based on delta file length
hudi-bot commented on PR #6670: URL: https://github.com/apache/hudi/pull/6670#issuecomment-1248889062 ## CI report: * 462f77736f855dc277cc62e0778fb4c1fa04f09a Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11357) * 9d0a67996db91ad90808ddb83aa281ad04e91a76 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6677: [HUDI-4294][Stacked on 4293] Introduce build action to actually perform index data generation
hudi-bot commented on PR #6677: URL: https://github.com/apache/hudi/pull/6677#issuecomment-1248887133 ## CI report: * 0ce0aee73e1641f071abdfc44d4f5473a425befb Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11375) * 5e4be0aff69d72a934a6b3e7f31a5be41d7d9ead Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11408) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #5933: [HUDI-4293] Implement Create/Drop/Show/Refresh Index Command for Secondary Index
hudi-bot commented on PR #5933: URL: https://github.com/apache/hudi/pull/5933#issuecomment-1248886699 ## CI report: * 65359879df848d75b6693f4c313dc9453d635edd Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11370) * 7e66a3b8fb65f6e0a73c4f10c74d80034d0e888d Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11407) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xushiyan commented on a diff in pull request #6662: [HUDI-4832] Fix drop partition meta sync
xushiyan commented on code in PR #6662: URL: https://github.com/apache/hudi/pull/6662#discussion_r972587570 ## hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java: ## @@ -158,4 +171,23 @@ public List getPartitionEvents(List tablePartitions, } return events; } + + /** + * Get Last commit's Metadata. + */ + private static Option getLatestCommitMetadata(HoodieTableMetaClient metaClient) { Review Comment: i think a more relevant logic we need for partition sync is: getCommitMetadataSinceLastSync(). it should return time-ordered commit metadata for extracting (partition, written|dropped) to be further produced partition events for sync -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6677: [HUDI-4294][Stacked on 4293] Introduce build action to actually perform index data generation
hudi-bot commented on PR #6677: URL: https://github.com/apache/hudi/pull/6677#issuecomment-1248884818 ## CI report: * 0ce0aee73e1641f071abdfc44d4f5473a425befb Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11375) * 5e4be0aff69d72a934a6b3e7f31a5be41d7d9ead UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #5933: [HUDI-4293] Implement Create/Drop/Show/Refresh Index Command for Secondary Index
hudi-bot commented on PR #5933: URL: https://github.com/apache/hudi/pull/5933#issuecomment-1248884303 ## CI report: * 65359879df848d75b6693f4c313dc9453d635edd Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11370) * 7e66a3b8fb65f6e0a73c4f10c74d80034d0e888d UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] TJX2014 commented on pull request #6630: [HUDI-4808] Fix HoodieSimpleBucketIndex not consider bucket num in lo…
TJX2014 commented on PR #6630: URL: https://github.com/apache/hudi/pull/6630#issuecomment-1248882064 Hi @danny0405 ci succeed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] TJX2014 commented on pull request #6634: [HUDI-4813] Fix infer keygen not work in sparksql side issue
TJX2014 commented on PR #6634: URL: https://github.com/apache/hudi/pull/6634#issuecomment-1248881741 Hi @danny0405 ci passed : ) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #6611: SchemaEvolution : Default value not getting fetched properly for not null columns from confluent kafka schema registry
nsivabalan commented on issue #6611: URL: https://github.com/apache/hudi/issues/6611#issuecomment-1248879098 I am yet to try this out locally and see how this pans out.bcoz, you have custom default value for strings. usually null defaults are taken into consideration. but non null defaults, I have not seen much. but for now, can you try disabling this config and see what happens. ``` hoodie.deltastreamer.schemaprovider.spark_avro_post_processor.enable ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #5249: [SUPPORT] Deltastreamer job does not terminate on Kubernetes when hoodie.metrics.on=true
nsivabalan commented on issue #5249: URL: https://github.com/apache/hudi/issues/5249#issuecomment-1248874327 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #6606: Observing data duplication with Single Writer
nsivabalan commented on issue #6606: URL: https://github.com/apache/hudi/issues/6606#issuecomment-1248873001 you can read about multi writer guarantees here https://hudi.apache.org/docs/concurrency_control#multi-writer-guarantees -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] nsivabalan commented on issue #6606: Observing data duplication with Single Writer
nsivabalan commented on issue #6606: URL: https://github.com/apache/hudi/issues/6606#issuecomment-1248872759 here is what is happening. if there are two concurrent writers writing to non overlapping data files, hudi will succeed both writes. but if both are modifying the same data file, hudi will succeed one and will fail another write. and hence you are seeing conflict resolution failed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] danny0405 commented on a diff in pull request #4676: [HUDI-3304] Support partial update payload
danny0405 commented on code in PR #4676: URL: https://github.com/apache/hudi/pull/4676#discussion_r972565604 ## hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteNonDefaultsWithLatestAvroPayload.java: ## @@ -58,19 +58,32 @@ public Option combineAndGetUpdateValue(IndexedRecord currentValue GenericRecord insertRecord = (GenericRecord) recordOption.get(); GenericRecord currentRecord = (GenericRecord) currentValue; -if (isDeleteRecord(insertRecord)) { +return mergeRecords(schema, insertRecord, currentRecord); + } + + /** + * Merges the given records into one. + * + * @param schema The record schema + * @param baseRecord The base record to merge with + * @param mergedRecord The record to be merged + * + * @return the merged record option + */ + protected Option mergeRecords(Schema schema, GenericRecord baseRecord, GenericRecord mergedRecord) { +if (isDeleteRecord(baseRecord)) { return Option.empty(); } else { final GenericRecordBuilder builder = new GenericRecordBuilder(schema); List fields = schema.getFields(); fields.forEach(field -> { -Object value = insertRecord.get(field.name()); +Object value = baseRecord.get(field.name()); value = field.schema().getType().equals(Schema.Type.STRING) && value != null ? value.toString() : value; Object defaultValue = field.defaultVal(); if (!overwriteField(value, defaultValue)) { builder.set(field, value); } else { - builder.set(field, currentRecord.get(field.pos())); + builder.set(field, mergedRecord.get(field.pos())); } Review Comment: The `field.pos()` should not be used here, see https://github.com/apache/hudi/pull/6689, also we should consider the metadata fields sequence: when compaction, we should use the metadata fields for later records no matter whether it's ordering val is greater or not. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] scxwhite commented on a diff in pull request #6670: [HUDI-4842] Support compression strategy based on delta file length
scxwhite commented on code in PR #6670: URL: https://github.com/apache/hudi/pull/6670#discussion_r972559146 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -106,6 +106,12 @@ public class HoodieCompactionConfig extends HoodieConfig { .withDocumentation("Only if the log file size is greater than the threshold in bytes," + " the file group will be compacted."); + public static final ConfigProperty COMPACTION_LOG_FILE_LENGTH_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.length.threshold") Review Comment: > `hoodie.compaction.logfile.length.threshold` -> `hoodie.compaction.logfile.num.threshold` `length` can be confused with the size of the log file. Great. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6489: [HUDI-4485] [cli] Bumped spring shell to 2.1.1. Updated the default …
hudi-bot commented on PR #6489: URL: https://github.com/apache/hudi/pull/6489#issuecomment-1248845860 ## CI report: * 7ea1f728918e22be5e545f0b565f4321f2e43143 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11323) * 7f4ac87702ec40ed706e7bdf1dd5ce74ce752846 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11406) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6358: [HUDI-4588][HUDI-4472] Fixing `HoodieParquetReader` to properly specify projected schema when reading Parquet file
hudi-bot commented on PR #6358: URL: https://github.com/apache/hudi/pull/6358#issuecomment-1248845763 ## CI report: * 288d166c49602a4593b1e97763a467811903737d UNKNOWN * c4b6bb8dc7a4ddce5f729e5a49ac10aad25e8931 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11394) * 2f9e8ca8d6893e973883dadcab117597ee6badd3 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11405) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #4676: [HUDI-3304] Support partial update payload
hudi-bot commented on PR #4676: URL: https://github.com/apache/hudi/pull/4676#issuecomment-1248844971 ## CI report: * 5944f5cbe9ce73fe6b7e27a0d381eaeb80dead38 UNKNOWN * 4ef7b451c3dd795906f3f68571256baeb330a59f UNKNOWN * 6aeb3d0d8f09aeab2a5766cf9d25ecb30537 UNKNOWN * e3914eb7b48fc4c5e3bd6f0fd00888ac6da8fa21 UNKNOWN * 24747bb7e1f23d6db70672cab3795cb131ce8dcb Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11371) * 8a1c4b87960724ac6cd0085743bc669098cdaf09 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11404) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] scxwhite commented on a diff in pull request #6670: [HUDI-4842] Support compression strategy based on delta file length
scxwhite commented on code in PR #6670: URL: https://github.com/apache/hudi/pull/6670#discussion_r972557765 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -106,6 +106,12 @@ public class HoodieCompactionConfig extends HoodieConfig { .withDocumentation("Only if the log file size is greater than the threshold in bytes," + " the file group will be compacted."); + public static final ConfigProperty COMPACTION_LOG_FILE_LENGTH_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.length.threshold") + .defaultValue(0L) Review Comment: > Should this be set to a reasonable value like `5` for example? Otherwise, it falls back to the behavior where all file groups are compacted. Thank you for your reply. I'm sorry, but I don't think we should adjust the default value to 5 or other values. The current default compression policy is based on the file size (LogFileSizeBasedCompactStrategy). If users want to adjust to the policy based on the number of files, and the default value is non-zero, they will find that there is no compression event triggered for a period of time, which will confuse them. So I think this value should be configured by the user, don't you think? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xushiyan commented on a diff in pull request #6652: [HUDI-4830] Fix testNoGlobalConfFileConfigured when add hudi-defaults.conf in default dir
xushiyan commented on code in PR #6652: URL: https://github.com/apache/hudi/pull/6652#discussion_r972557608 ## hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java: ## @@ -173,7 +173,9 @@ public void testNoGlobalConfFileConfigured() { ENVIRONMENT_VARIABLES.clear(DFSPropertiesConfiguration.CONF_FILE_DIR_ENV_NAME); // Should not throw any exception when no external configuration file configured DFSPropertiesConfiguration.refreshGlobalProps(); -assertEquals(0, DFSPropertiesConfiguration.getGlobalProps().size()); +DFSPropertiesConfiguration defaultDfsPropertiesConfiguration = new DFSPropertiesConfiguration(); + defaultDfsPropertiesConfiguration.addPropsFromFile(DFSPropertiesConfiguration.DEFAULT_PATH); +assertEquals(defaultDfsPropertiesConfiguration.getProps().size(), DFSPropertiesConfiguration.getGlobalProps().size()); Review Comment: so the logic should be count the configs from hudi-defaults.conf at the default path if exists, or make it 0 if not exists. @Zouxxyy can you make change for this please? then we should be good to land -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xushiyan commented on a diff in pull request #6652: [HUDI-4830] Fix testNoGlobalConfFileConfigured when add hudi-defaults.conf in default dir
xushiyan commented on code in PR #6652: URL: https://github.com/apache/hudi/pull/6652#discussion_r972557066 ## hudi-common/src/test/java/org/apache/hudi/common/util/TestDFSPropertiesConfiguration.java: ## @@ -173,7 +173,9 @@ public void testNoGlobalConfFileConfigured() { ENVIRONMENT_VARIABLES.clear(DFSPropertiesConfiguration.CONF_FILE_DIR_ENV_NAME); // Should not throw any exception when no external configuration file configured DFSPropertiesConfiguration.refreshGlobalProps(); -assertEquals(0, DFSPropertiesConfiguration.getGlobalProps().size()); +DFSPropertiesConfiguration defaultDfsPropertiesConfiguration = new DFSPropertiesConfiguration(); + defaultDfsPropertiesConfiguration.addPropsFromFile(DFSPropertiesConfiguration.DEFAULT_PATH); +assertEquals(defaultDfsPropertiesConfiguration.getProps().size(), DFSPropertiesConfiguration.getGlobalProps().size()); Review Comment: let's not create DFSPropertiesConfiguration again to generate the expected value. we should just load the default hudi-defaults.conf with plain java properties and get the count as the expected size. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6489: [HUDI-4485] [cli] Bumped spring shell to 2.1.1. Updated the default …
hudi-bot commented on PR #6489: URL: https://github.com/apache/hudi/pull/6489#issuecomment-1248843297 ## CI report: * 7ea1f728918e22be5e545f0b565f4321f2e43143 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11323) * 7f4ac87702ec40ed706e7bdf1dd5ce74ce752846 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6358: [HUDI-4588][HUDI-4472] Fixing `HoodieParquetReader` to properly specify projected schema when reading Parquet file
hudi-bot commented on PR #6358: URL: https://github.com/apache/hudi/pull/6358#issuecomment-1248843191 ## CI report: * 288d166c49602a4593b1e97763a467811903737d UNKNOWN * c4b6bb8dc7a4ddce5f729e5a49ac10aad25e8931 Azure: [FAILURE](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11394) * 2f9e8ca8d6893e973883dadcab117597ee6badd3 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #4676: [HUDI-3304] Support partial update payload
hudi-bot commented on PR #4676: URL: https://github.com/apache/hudi/pull/4676#issuecomment-1248842484 ## CI report: * 5944f5cbe9ce73fe6b7e27a0d381eaeb80dead38 UNKNOWN * 4ef7b451c3dd795906f3f68571256baeb330a59f UNKNOWN * 6aeb3d0d8f09aeab2a5766cf9d25ecb30537 UNKNOWN * e3914eb7b48fc4c5e3bd6f0fd00888ac6da8fa21 UNKNOWN * 24747bb7e1f23d6db70672cab3795cb131ce8dcb Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11371) * 8a1c4b87960724ac6cd0085743bc669098cdaf09 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6676: [HUDI-4453] Fix schema to include partition columns in bootstrap operation
hudi-bot commented on PR #6676: URL: https://github.com/apache/hudi/pull/6676#issuecomment-1248840961 ## CI report: * fa203bff2e2bb9fc27e50f0b0c2613770bfa5dc6 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11400) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
[ https://issues.apache.org/jira/browse/HUDI-4855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4855: Sprint: 2022/09/05 > Bootstrap table from Deltastreamer cannot be read in Spark > -- > > Key: HUDI-4855 > URL: https://issues.apache.org/jira/browse/HUDI-4855 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > > > {code:java} > scala> val df = spark.read.format("hudi").load("") > org.apache.hudi.exception.HoodieException: No files found for reading in user > provided path. > at > org.apache.hudi.HoodieBootstrapRelation.buildFileIndex(HoodieBootstrapRelation.scala:167) > at > org.apache.hudi.HoodieBootstrapRelation.(HoodieBootstrapRelation.scala:65) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:144) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:68) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188) > ... 47 elided > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for metadata-only bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Sprint: 2022/09/05 > Deltastreamer does not respect partition selector regex for metadata-only > bootstrap > --- > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] paul8263 commented on pull request #6489: [HUDI-4485] [cli] Bumped spring shell to 2.1.1. Updated the default …
paul8263 commented on PR #6489: URL: https://github.com/apache/hudi/pull/6489#issuecomment-1248831904 Pushed an update that resolves the merge conflicts. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
[ https://issues.apache.org/jira/browse/HUDI-4855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4855: Component/s: bootstrap > Bootstrap table from Deltastreamer cannot be read in Spark > -- > > Key: HUDI-4855 > URL: https://issues.apache.org/jira/browse/HUDI-4855 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
[ https://issues.apache.org/jira/browse/HUDI-4855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4855: Description: {code:java} scala> val df = spark.read.format("hudi").load("") org.apache.hudi.exception.HoodieException: No files found for reading in user provided path. at org.apache.hudi.HoodieBootstrapRelation.buildFileIndex(HoodieBootstrapRelation.scala:167) at org.apache.hudi.HoodieBootstrapRelation.(HoodieBootstrapRelation.scala:65) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:144) at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:68) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350) at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274) at org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188) ... 47 elided {code} > Bootstrap table from Deltastreamer cannot be read in Spark > -- > > Key: HUDI-4855 > URL: https://issues.apache.org/jira/browse/HUDI-4855 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > > > {code:java} > scala> val df = spark.read.format("hudi").load("") > org.apache.hudi.exception.HoodieException: No files found for reading in user > provided path. 
> at > org.apache.hudi.HoodieBootstrapRelation.buildFileIndex(HoodieBootstrapRelation.scala:167) > at > org.apache.hudi.HoodieBootstrapRelation.(HoodieBootstrapRelation.scala:65) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:144) > at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:68) > at > org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350) > at > org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274) > at > org.apache.spark.sql.DataFrameReader.$anonfun$load$3(DataFrameReader.scala:245) > at scala.Option.getOrElse(Option.scala:189) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:245) > at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188) > ... 47 elided > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
[ https://issues.apache.org/jira/browse/HUDI-4855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4855: Epic Link: HUDI-1265 Story Points: 1 > Bootstrap table from Deltastreamer cannot be read in Spark > -- > > Key: HUDI-4855 > URL: https://issues.apache.org/jira/browse/HUDI-4855 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
[ https://issues.apache.org/jira/browse/HUDI-4855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4855: Fix Version/s: 0.12.1 > Bootstrap table from Deltastreamer cannot be read in Spark > -- > > Key: HUDI-4855 > URL: https://issues.apache.org/jira/browse/HUDI-4855 > Project: Apache Hudi > Issue Type: Bug >Reporter: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Created] (HUDI-4855) Bootstrap table from Deltastreamer cannot be read in Spark
Ethan Guo created HUDI-4855: --- Summary: Bootstrap table from Deltastreamer cannot be read in Spark Key: HUDI-4855 URL: https://issues.apache.org/jira/browse/HUDI-4855 Project: Apache Hudi Issue Type: Bug Reporter: Ethan Guo -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Fix Version/s: 0.12.1 > Deltastreamer does not respect partition selector regex for bootstrap > - > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Improvement >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for metadata-only bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Summary: Deltastreamer does not respect partition selector regex for metadata-only bootstrap (was: Deltastreamer does not respect partition selector regex for bootstrap) > Deltastreamer does not respect partition selector regex for metadata-only > bootstrap > --- > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Story Points: 1 Issue Type: Bug (was: Improvement) > Deltastreamer does not respect partition selector regex for bootstrap > - > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Bug >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Updated] (HUDI-4854) Deltastreamer does not respect partition selector regex for bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo updated HUDI-4854: Component/s: bootstrap Epic Link: HUDI-1265 > Deltastreamer does not respect partition selector regex for bootstrap > - > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Bug > Components: bootstrap >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Assigned] (HUDI-4854) Deltastreamer does not respect partition selector regex for bootstrap
[ https://issues.apache.org/jira/browse/HUDI-4854?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ethan Guo reassigned HUDI-4854: --- Assignee: Ethan Guo > Deltastreamer does not respect partition selector regex for bootstrap > - > > Key: HUDI-4854 > URL: https://issues.apache.org/jira/browse/HUDI-4854 > Project: Apache Hudi > Issue Type: Improvement >Reporter: Ethan Guo >Assignee: Ethan Guo >Priority: Major > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[jira] [Created] (HUDI-4854) Deltastreamer does not respect partition selector regex for bootstrap
Ethan Guo created HUDI-4854: --- Summary: Deltastreamer does not respect partition selector regex for bootstrap Key: HUDI-4854 URL: https://issues.apache.org/jira/browse/HUDI-4854 Project: Apache Hudi Issue Type: Improvement Reporter: Ethan Guo -- This message was sent by Atlassian Jira (v8.20.10#820010)
[hudi] branch master updated: [HUDI-4796] MetricsReporter stop bug (#6619)
This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new bf64e60d31 [HUDI-4796] MetricsReporter stop bug (#6619) bf64e60d31 is described below commit bf64e60d3121c67f51859be4ce42d04c6a5704be Author: Tim Brown AuthorDate: Thu Sep 15 18:57:58 2022 -0700 [HUDI-4796] MetricsReporter stop bug (#6619) --- .../org/apache/hudi/metrics/ConsoleMetricsReporter.java | 15 --- .../org/apache/hudi/metrics/InMemoryMetricsReporter.java | 7 --- .../java/org/apache/hudi/metrics/JmxMetricsReporter.java | 7 --- .../src/main/java/org/apache/hudi/metrics/Metrics.java| 15 --- .../org/apache/hudi/metrics/MetricsGraphiteReporter.java | 6 -- .../java/org/apache/hudi/metrics/MetricsReporter.java | 4 .../metrics/cloudwatch/CloudWatchMetricsReporter.java | 6 -- .../hudi/metrics/datadog/DatadogMetricsReporter.java | 6 -- .../hudi/metrics/prometheus/PrometheusReporter.java | 13 - .../metrics/prometheus/PushGatewayMetricsReporter.java| 11 +++ .../apache/hudi/metrics/TestMetricsReporterFactory.java | 6 -- 11 files changed, 15 insertions(+), 81 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java index b65c4ade88..5664240c62 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/ConsoleMetricsReporter.java @@ -18,15 +18,13 @@ package org.apache.hudi.metrics; -import java.io.Closeable; -import java.util.concurrent.TimeUnit; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - import com.codahale.metrics.ConsoleReporter; import com.codahale.metrics.MetricFilter; import com.codahale.metrics.MetricRegistry; 
+import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.concurrent.TimeUnit; /** * Hudi Console metrics reporter. Reports the metrics by printing them to the stdout on the console. @@ -61,11 +59,6 @@ public class ConsoleMetricsReporter extends MetricsReporter { } } - @Override - public Closeable getReporter() { -return consoleReporter; - } - @Override public void stop() { if (consoleReporter != null) { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java index a145024574..96439c3b31 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/InMemoryMetricsReporter.java @@ -18,8 +18,6 @@ package org.apache.hudi.metrics; -import java.io.Closeable; - /** * Used for testing. */ @@ -31,11 +29,6 @@ public class InMemoryMetricsReporter extends MetricsReporter { @Override public void report() {} - @Override - public Closeable getReporter() { -return null; - } - @Override public void stop() { diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java index 309981a9d8..a909f62355 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/JmxMetricsReporter.java @@ -26,12 +26,10 @@ import org.apache.log4j.LogManager; import javax.management.MBeanServer; -import java.io.Closeable; import java.lang.management.ManagementFactory; import java.util.Objects; import java.util.stream.IntStream; - /** * Implementation of Jmx reporter, which used to report jmx metric. 
*/ @@ -86,11 +84,6 @@ public class JmxMetricsReporter extends MetricsReporter { public void report() { } - @Override - public Closeable getReporter() { -return jmxReporterServer.getReporter(); - } - @Override public void stop() { Objects.requireNonNull(jmxReporterServer, "jmxReporterServer is not running."); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java index d9f22bca01..8f3e497481 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metrics/Metrics.java @@ -27,7 +27,6 @@ import com.codahale.metrics.MetricRegistry;
[GitHub] [hudi] yihua merged pull request #6619: [HUDI-4796] MetricsReporter stop bug
yihua merged PR #6619: URL: https://github.com/apache/hudi/pull/6619 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua closed issue #6517: [SUPPORT] when using bootstrap partitioned table, partition column return null when select table
yihua closed issue #6517: [SUPPORT] when using bootstrap partitioned table, partition column return null when select table URL: https://github.com/apache/hudi/issues/6517 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on issue #6517: [SUPPORT] when using bootstrap partitioned table, partition column return null when select table
yihua commented on issue #6517: URL: https://github.com/apache/hudi/issues/6517#issuecomment-1248823659 #6673 and #6676 have fixed the problem of reading the partition column from a bootstrap table and I verified that it works (see the `df.show` result below after bootstrap). Closing this issue. @peanut-chenzhong feel free to reopen this if you still see the problem. ``` scala> df.show +---+++--+++-++++++-+-+ |_hoodie_commit_time|_hoodie_commit_seqno| _hoodie_record_key|_hoodie_partition_path| _hoodie_file_name| key| ts| textField|decimalField| longField| arrayField|mapField|round|partition| +---+++--+++-++++++-+-+ | 02| 02_1_0|000-416e-f335-1f3...| 2022/1/31|356f2b69-6958-465...|000-416e-f335-1f3...|1643949407427|abcdefghijklmnopq...| 0.5398461| 4486089480226173414|[0, 1, 2, 3, 4, 5...|{4a19-ff6d-95f87c...| 0|2022/1/31| | 02| 02_1_1|000-4638-bd51-7ce...| 2022/1/31|356f2b69-6958-465...|000-4638-bd51-7ce...|1643949404254|abcdefghijklmnopq...| 0.542539|-7250499539432824960|[0, 1, 2, 3, 4, 5...|{4a1c-6792-b6a852...| 0|2022/1/31| ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated (22d6019559 -> 6e31b7cef4)
This is an automated email from the ASF dual-hosted git repository. xushiyan pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 22d6019559 [HUDI-4706] Fix InternalSchemaChangeApplier#applyAddChange error to add nest type (#6486) add 6e31b7cef4 [HUDI-4851] Fixing CSI not handling `InSet` operator properly (#6685) No new revisions were added by this update. Summary of changes: .../apache/spark/sql/hudi/DataSkippingUtils.scala | 20 -- .../org/apache/hudi/TestDataSkippingUtils.scala| 72 +- 2 files changed, 70 insertions(+), 22 deletions(-)
[GitHub] [hudi] yihua commented on a diff in pull request #6670: [HUDI-4842] Support compaction strategy based on delta file length
yihua commented on code in PR #6670: URL: https://github.com/apache/hudi/pull/6670#discussion_r972540738 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -106,6 +106,12 @@ public class HoodieCompactionConfig extends HoodieConfig { .withDocumentation("Only if the log file size is greater than the threshold in bytes," + " the file group will be compacted."); + public static final ConfigProperty COMPACTION_LOG_FILE_LENGTH_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.length.threshold") + .defaultValue(0L) Review Comment: Should this be set to a reasonable value like `5` for example? Otherwise, it falls back to the behavior where all file groups are compacted. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xushiyan closed issue #6655: [SUPPORT] tryComposeIndexFilterExpr in dataskip util could support InSet expression of spark?
xushiyan closed issue #6655: [SUPPORT] tryComposeIndexFilterExpr in dataskip util could support InSet expression of spark? URL: https://github.com/apache/hudi/issues/6655 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] xushiyan merged pull request #6685: [HUDI-4851] Fixing CSI not handling `InSet` operator properly
xushiyan merged PR #6685: URL: https://github.com/apache/hudi/pull/6685 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated (488f58d770 -> 22d6019559)
This is an automated email from the ASF dual-hosted git repository. mengtao pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git from 488f58d770 [HUDI-4785] Fix partition discovery in bootstrap operation (#6673) add 22d6019559 [HUDI-4706] Fix InternalSchemaChangeApplier#applyAddChange error to add nest type (#6486) No new revisions were added by this update. Summary of changes: .../schema/action/InternalSchemaChangeApplier.java | 3 +- .../internal/schema/action/TableChangesHelper.java | 5 ++ .../internal/schema/action/TestTableChanges.java | 86 ++ 3 files changed, 93 insertions(+), 1 deletion(-)
[GitHub] [hudi] xiarixiaoyao merged pull request #6486: [HUDI-4706] Fix InternalSchemaChangeApplier#applyAddChange error to add nest type
xiarixiaoyao merged PR #6486: URL: https://github.com/apache/hudi/pull/6486 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #6670: [HUDI-4842] Support compaction strategy based on delta file length
yihua commented on code in PR #6670: URL: https://github.com/apache/hudi/pull/6670#discussion_r972532363 ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -241,41 +255,61 @@ public class HoodieCompactionConfig extends HoodieConfig { */ @Deprecated public static final String COMPACTION_STRATEGY_PROP = COMPACTION_STRATEGY.key(); - /** @deprecated Use {@link #COMPACTION_STRATEGY} and its methods instead */ + /** + * @deprecated Use {@link #COMPACTION_STRATEGY} and its methods instead + */ @Deprecated public static final String DEFAULT_COMPACTION_STRATEGY = COMPACTION_STRATEGY.defaultValue(); - /** @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead */ + /** + * @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead + */ @Deprecated public static final String COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = COMPACTION_LAZY_BLOCK_READ_ENABLE.key(); - /** @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead */ + /** + * @deprecated Use {@link #COMPACTION_LAZY_BLOCK_READ_ENABLE} and its methods instead + */ @Deprecated public static final String DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED = COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue(); - /** @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead */ + /** + * @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead + */ @Deprecated public static final String COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = COMPACTION_REVERSE_LOG_READ_ENABLE.key(); - /** @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead */ + /** + * @deprecated Use {@link #COMPACTION_REVERSE_LOG_READ_ENABLE} and its methods instead + */ @Deprecated public static final String DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED = COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue(); + /** + * @deprecated Use {@link 
#TARGET_PARTITIONS_PER_DAYBASED_COMPACTION} and its methods instead + */ + @Deprecated + public static final String TARGET_PARTITIONS_PER_DAYBASED_COMPACTION_PROP = TARGET_PARTITIONS_PER_DAYBASED_COMPACTION.key(); + /** + * @deprecated Use {@link #TARGET_PARTITIONS_PER_DAYBASED_COMPACTION} and its methods instead + */ + @Deprecated + public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = TARGET_PARTITIONS_PER_DAYBASED_COMPACTION.defaultValue(); /** * @deprecated Use {@link #INLINE_COMPACT} and its methods instead */ @Deprecated private static final String DEFAULT_INLINE_COMPACT = INLINE_COMPACT.defaultValue(); - /** @deprecated Use {@link #INLINE_COMPACT_NUM_DELTA_COMMITS} and its methods instead */ + /** + * @deprecated Use {@link #INLINE_COMPACT_NUM_DELTA_COMMITS} and its methods instead + */ @Deprecated private static final String DEFAULT_INLINE_COMPACT_NUM_DELTA_COMMITS = INLINE_COMPACT_NUM_DELTA_COMMITS.defaultValue(); - /** @deprecated Use {@link #INLINE_COMPACT_TIME_DELTA_SECONDS} and its methods instead */ + /** + * @deprecated Use {@link #INLINE_COMPACT_TIME_DELTA_SECONDS} and its methods instead + */ @Deprecated private static final String DEFAULT_INLINE_COMPACT_TIME_DELTA_SECONDS = INLINE_COMPACT_TIME_DELTA_SECONDS.defaultValue(); - /** @deprecated Use {@link #INLINE_COMPACT_TRIGGER_STRATEGY} and its methods instead */ + /** Review Comment: Could you avoid code style changes to reduce review overhead? It's OK to have a separate PR to reformat the code. 
## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -381,6 +415,11 @@ public Builder withLogFileSizeThresholdBasedCompaction(long logFileSizeThreshold return this; } +public Builder withLogFileLengthThresholdBasedCompaction(int logFileLengthThreshold) { + compactionConfig.setValue(COMPACTION_LOG_FILE_LENGTH_THRESHOLD, String.valueOf(logFileLengthThreshold)); + return this; Review Comment: Same here for method name and variable renaming. ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCompactionConfig.java: ## @@ -106,6 +106,12 @@ public class HoodieCompactionConfig extends HoodieConfig { .withDocumentation("Only if the log file size is greater than the threshold in bytes," + " the file group will be compacted."); + public static final ConfigProperty COMPACTION_LOG_FILE_LENGTH_THRESHOLD = ConfigProperty + .key("hoodie.compaction.logfile.length.threshold") Review Comment: `hoodie.compaction.logfile.length.threshold` -> `hoodie.compaction.logfile.num.threshold` `length` can be confused with the size of the log file. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub
[GitHub] [hudi] hudi-bot commented on pull request #6676: [HUDI-4453] Fix schema to include partition columns in bootstrap operation
hudi-bot commented on PR #6676: URL: https://github.com/apache/hudi/pull/6676#issuecomment-1248813922 ## CI report: * fa203bff2e2bb9fc27e50f0b0c2613770bfa5dc6 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6689: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroP…
hudi-bot commented on PR #6689: URL: https://github.com/apache/hudi/pull/6689#issuecomment-1248808740 ## CI report: * cab4b6a3b31aff9a0aa4a825d341346aaa7ede73 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11399) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6688: Fix AWSDmsAvroPayload#combineAndGetUpdateValue when using MOR snapshot query after delete operations
hudi-bot commented on PR #6688: URL: https://github.com/apache/hudi/pull/6688#issuecomment-1248808723 ## CI report: * fff1405467fb5f6a7fdb6d3d043714e268f1c875 Azure: [PENDING](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11398) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6689: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroP…
hudi-bot commented on PR #6689: URL: https://github.com/apache/hudi/pull/6689#issuecomment-1248806292 ## CI report: * cab4b6a3b31aff9a0aa4a825d341346aaa7ede73 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6688: Fix AWSDmsAvroPayload#combineAndGetUpdateValue when using MOR snapshot query after delete operations
hudi-bot commented on PR #6688: URL: https://github.com/apache/hudi/pull/6688#issuecomment-1248806264 ## CI report: * fff1405467fb5f6a7fdb6d3d043714e268f1c875 UNKNOWN Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] hudi-bot commented on pull request #6615: [HUDI-4758] Add validations to java spark examples
hudi-bot commented on PR #6615: URL: https://github.com/apache/hudi/pull/6615#issuecomment-1248803426 ## CI report: * d675d338c90b09abbdbcc84003873cf05c40f871 Azure: [SUCCESS](https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_build/results?buildId=11395) Bot commands @hudi-bot supports the following commands: - `@hudi-bot run azure` re-run the last Azure build -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[hudi] branch master updated: [HUDI-4785] Fix partition discovery in bootstrap operation (#6673)
This is an automated email from the ASF dual-hosted git repository. yihua pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hudi.git The following commit(s) were added to refs/heads/master by this push: new 488f58d770 [HUDI-4785] Fix partition discovery in bootstrap operation (#6673) 488f58d770 is described below commit 488f58d770137057532196065f2f69eea1a15db8 Author: Sagar Sumit AuthorDate: Fri Sep 16 06:36:44 2022 +0530 [HUDI-4785] Fix partition discovery in bootstrap operation (#6673) Co-authored-by: Y Ethan Guo --- .../apache/hudi/config/HoodieBootstrapConfig.java | 20 + .../SparkBootstrapCommitActionExecutor.java| 47 -- .../SparkBootstrapDeltaCommitActionExecutor.java | 12 -- .../hudi/common/table/TableSchemaResolver.java | 34 .../hudi/common/table/TestTableSchemaResolver.java | 12 +++--- .../org/apache/hudi/HoodieBootstrapRelation.scala | 7 ++-- .../SparkFullBootstrapDataProviderBase.java| 4 +- .../functional/TestDataSourceForBootstrap.scala| 18 +++-- 8 files changed, 99 insertions(+), 55 deletions(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java index 94bb7830cc..0b9116b01c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java @@ -34,6 +34,9 @@ import java.io.FileReader; import java.io.IOException; import java.util.Properties; +import static org.apache.hudi.client.bootstrap.BootstrapMode.FULL_RECORD; +import static org.apache.hudi.client.bootstrap.BootstrapMode.METADATA_ONLY; + /** * Bootstrap specific configs. 
*/ @@ -50,6 +53,15 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table"); + public static final ConfigProperty PARTITION_SELECTOR_REGEX_MODE = ConfigProperty + .key("hoodie.bootstrap.mode.selector.regex.mode") + .defaultValue(METADATA_ONLY.name()) + .sinceVersion("0.6.0") + .withValidValues(METADATA_ONLY.name(), FULL_RECORD.name()) + .withDocumentation("Bootstrap mode to apply for partition paths, that match regex above. " + + "METADATA_ONLY will generate just skeleton base files with keys/footers, avoiding full cost of rewriting the dataset. " + + "FULL_RECORD will perform a full copy/rewrite of the data as a Hudi table."); + public static final ConfigProperty MODE_SELECTOR_CLASS_NAME = ConfigProperty .key("hoodie.bootstrap.mode.selector") .defaultValue(MetadataOnlyBootstrapModeSelector.class.getCanonicalName()) @@ -92,14 +104,6 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Matches each bootstrap dataset partition against this regex and applies the mode below to it."); - public static final ConfigProperty PARTITION_SELECTOR_REGEX_MODE = ConfigProperty - .key("hoodie.bootstrap.mode.selector.regex.mode") - .defaultValue(BootstrapMode.METADATA_ONLY.name()) - .sinceVersion("0.6.0") - .withDocumentation("Bootstrap mode to apply for partition paths, that match regex above. " - + "METADATA_ONLY will generate just skeleton base files with keys/footers, avoiding full cost of rewriting the dataset. 
" - + "FULL_RECORD will perform a full copy/rewrite of the data as a Hudi table."); - public static final ConfigProperty INDEX_CLASS_NAME = ConfigProperty .key("hoodie.bootstrap.index.class") .defaultValue(HFileBootstrapIndex.class.getName()) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java index a2ee384940..88f6a54e0d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/bootstrap/SparkBootstrapCommitActionExecutor.java @@ -26,6 +26,7 @@ import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider; import org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider; import org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider; import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector; +import org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector; import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator; import
[GitHub] [hudi] yihua merged pull request #6673: [HUDI-4785] Fix partition discovery in bootstrap operation
yihua merged PR #6673: URL: https://github.com/apache/hudi/pull/6673 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on pull request #6673: [HUDI-4785] Fix partition discovery in bootstrap operation
yihua commented on PR #6673: URL: https://github.com/apache/hudi/pull/6673#issuecomment-1248801714 Merging this as the rebasing only touches the `TestDataSourceForBootstrap` and it passes locally. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] 5herhom commented on pull request #6031: [HUDI-4282] Repair IOException in some other dfs, except hdfs, when checking block corrupted in HoodieLogFileReader
5herhom commented on PR #6031: URL: https://github.com/apache/hudi/pull/6031#issuecomment-1248796155 > @5herhom : can you follow up on the feedback. its nearing landing. Sorry, I'm busy these days. I will commit in two days -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Updated] (HUDI-4853) Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid schema mismatch
[ https://issues.apache.org/jira/browse/HUDI-4853?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] ASF GitHub Bot updated HUDI-4853: - Labels: pull-request-available (was: ) > Get field by name for OverwriteNonDefaultsWithLatestAvroPayload to avoid > schema mismatch > > > Key: HUDI-4853 > URL: https://issues.apache.org/jira/browse/HUDI-4853 > Project: Apache Hudi > Issue Type: Bug > Components: core >Affects Versions: 0.12.0 >Reporter: Danny Chen >Priority: Major > Labels: pull-request-available > Fix For: 0.12.1 > > -- This message was sent by Atlassian Jira (v8.20.10#820010)
[GitHub] [hudi] danny0405 opened a new pull request, #6689: [HUDI-4853] Get field by name for OverwriteNonDefaultsWithLatestAvroP…
danny0405 opened a new pull request, #6689: URL: https://github.com/apache/hudi/pull/6689 …ayload to avoid schema mismatch ### Change Logs _Describe context and summary for this change. Highlight if any code was copied._ ### Impact _Describe any public API or user-facing feature change or any performance impact._ **Risk level: none | low | medium | high** _Choose one. If medium or high, explain what verification was done to mitigate the risks._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] paul8263 commented on a diff in pull request #6489: [HUDI-4485] [cli] Bumped spring shell to 2.1.1. Updated the default …
paul8263 commented on code in PR #6489: URL: https://github.com/apache/hudi/pull/6489#discussion_r972521635 ## hudi-cli/src/main/java/org/apache/hudi/cli/commands/SparkMain.java: ## @@ -86,7 +87,7 @@ */ public class SparkMain { - private static final Logger LOG = Logger.getLogger(SparkMain.class); + private static final Logger LOG = LogManager.getLogger(SparkMain.class); Review Comment: Hi @rahil-c , Logger belongs to log4j-1.2-api while LogManager is log4j 2.x API. Previously I changed all logs to slf4j but now restored all of them to log4j2. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] rahil-c opened a new pull request, #6688: Fix AWSDmsAvroPayload#combineAndGetUpdateValue when using MOR snapshot query after delete operations
rahil-c opened a new pull request, #6688: URL: https://github.com/apache/hudi/pull/6688 ### Change Logs _Describe context and summary for this change. Highlight if any code was copied._ ### Impact _Describe any public API or user-facing feature change or any performance impact._ **Risk level: none | low | medium | high** _Choose one. If medium or high, explain what verification was done to mitigate the risks._ ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [hudi] yihua commented on a diff in pull request #6673: [HUDI-4785] Fix partition discovery in bootstrap operation
yihua commented on code in PR #6673: URL: https://github.com/apache/hudi/pull/6673#discussion_r972517320 ## hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala: ## @@ -147,7 +146,7 @@ class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext, if (fullSchema == null) { logInfo("Inferring schema..") val schemaResolver = new TableSchemaResolver(metaClient) - val tableSchema = schemaResolver.getTableAvroSchemaWithoutMetadataFields + val tableSchema = TableSchemaResolver.appendPartitionColumns(schemaResolver.getTableAvroSchemaWithoutMetadataFields, metaClient.getTableConfig.getPartitionFields) Review Comment: We should also fix the table schema stored inside the commit metadata to include the partition column with the correct inferred type, fixed in #6676. ## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java: ## @@ -50,9 +53,25 @@ public class HoodieBootstrapConfig extends HoodieConfig { .sinceVersion("0.6.0") .withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table"); + public static final ConfigProperty PARTITION_SELECTOR_REGEX_MODE = ConfigProperty + .key("hoodie.bootstrap.mode.selector.regex.mode") + .defaultValue(METADATA_ONLY.name()) + .sinceVersion("0.6.0") + .withValidValues(METADATA_ONLY.name(), FULL_RECORD.name()) + .withDocumentation("Bootstrap mode to apply for partition paths, that match regex above. " + + "METADATA_ONLY will generate just skeleton base files with keys/footers, avoiding full cost of rewriting the dataset. 
" + + "FULL_RECORD will perform a full copy/rewrite of the data as a Hudi table."); + public static final ConfigProperty MODE_SELECTOR_CLASS_NAME = ConfigProperty .key("hoodie.bootstrap.mode.selector") .defaultValue(MetadataOnlyBootstrapModeSelector.class.getCanonicalName()) + /*.withInferFunction(cfg -> { Review Comment: nit: remove unused code -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org