This is an automated email from the ASF dual-hosted git repository.

Jackie-Jiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 3f58dba6317 Add optional source-field data type fix during ingestion (#18816)
3f58dba6317 is described below

commit 3f58dba6317c3987f4ab67ff2264ab5548a541ab
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Sun Jun 21 11:30:17 2026 -0700

    Add optional source-field data type fix during ingestion (#18816)
---
 .../recordtransformer/DataTypeTransformer.java     | 44 ++++++++++---
 .../recordtransformer/RecordTransformerUtils.java  | 35 ++++++++++
 .../segment/local/utils/TableConfigUtils.java      | 18 +++++
 .../recordtransformer/RecordTransformerTest.java   | 48 ++++++++++++++
 .../segment/local/utils/TableConfigUtilsTest.java  | 31 +++++++++
 .../config/table/ingestion/IngestionConfig.java    | 48 ++++++--------
 .../config/table/ingestion/SourceFieldConfig.java  | 76 ++++++++++++++++++++++
 .../table/ingestion/SourceFieldConfigTest.java     | 71 ++++++++++++++++++++
 8 files changed, 333 insertions(+), 38 deletions(-)

diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
index cd951cbdd51..6deb8390855 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
@@ -35,27 +35,53 @@ import org.slf4j.LoggerFactory;
 
 
 /**
- * The {@code DataTypeTransformer} class will convert the values to follow the data types in {@link FieldSpec}.
- * <p>NOTE: should put this after all the values has been generated by other transformers (such as
- * {@link ExpressionTransformer}). After this, all values should be of the desired data types.
+ * The {@code DataTypeTransformer} class converts column values to follow the configured {@link PinotDataType}s. It has
+ * two usages:
+ * <ul>
+ *   <li>Schema columns: constructed from the {@link Schema}, it should run after all the values have been generated by
+ *   other transformers (such as {@link ExpressionTransformer}) so that all values follow the data types in
+ *   {@link FieldSpec}.</li>
+ *   <li>Source fields: constructed from an explicit column-to-{@link PinotDataType} map (see
+ *   {@code IngestionConfig.sourceFieldConfigs}), it runs before other transformers (such as
+ *   {@link ExpressionTransformer}) to fix the data types of the source fields they consume.</li>
+ * </ul>
  */
-@SuppressWarnings("rawtypes")
 public class DataTypeTransformer implements RecordTransformer {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(DataTypeTransformer.class);
 
-  private final Map<String, PinotDataType> _dataTypes = new HashMap<>();
+  private final Map<String, PinotDataType> _dataTypes;
   private final boolean _continueOnError;
   private final ThrottledLogger _throttledLogger;
 
+  /// Creates a [DataTypeTransformer] that converts the (non-virtual) schema columns to the data types defined in the
+  /// [Schema].
   public DataTypeTransformer(TableConfig tableConfig, Schema schema) {
+    this(tableConfig, extractSchemaDataTypes(schema));
+  }
+
+  /// Creates a [DataTypeTransformer] that converts the given columns to the provided [PinotDataType]s. This is useful
+  /// for fixing the data types of source fields before other transformers (such as [ExpressionTransformer]) consume
+  /// them.
+  public DataTypeTransformer(TableConfig tableConfig, Map<String, 
PinotDataType> dataTypes) {
+    _dataTypes = dataTypes;
+    IngestionConfig ingestionConfig = tableConfig.getIngestionConfig();
+    _continueOnError = ingestionConfig != null && 
ingestionConfig.isContinueOnError();
+    _throttledLogger = new ThrottledLogger(LOGGER, ingestionConfig);
+  }
+
+  private static Map<String, PinotDataType> extractSchemaDataTypes(Schema 
schema) {
+    Map<String, PinotDataType> dataTypes = new HashMap<>();
     for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
       if (!fieldSpec.isVirtualColumn()) {
-        _dataTypes.put(fieldSpec.getName(), 
PinotDataType.getPinotDataTypeForIngestion(fieldSpec));
+        dataTypes.put(fieldSpec.getName(), 
PinotDataType.getPinotDataTypeForIngestion(fieldSpec));
       }
     }
-    IngestionConfig ingestionConfig = tableConfig.getIngestionConfig();
-    _continueOnError = ingestionConfig != null && 
ingestionConfig.isContinueOnError();
-    _throttledLogger = new ThrottledLogger(LOGGER, ingestionConfig);
+    return dataTypes;
+  }
+
+  @Override
+  public boolean isNoOp() {
+    return _dataTypes.isEmpty();
   }
 
   @Override
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerUtils.java
index 60f9f03aeca..80a65f9ad0c 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerUtils.java
@@ -21,18 +21,22 @@ package org.apache.pinot.segment.local.recordtransformer;
 import com.google.common.base.Preconditions;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import javax.annotation.Nullable;
 import org.apache.commons.collections4.CollectionUtils;
 import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.config.table.UpsertConfig;
 import org.apache.pinot.spi.config.table.ingestion.EnrichmentConfig;
 import org.apache.pinot.spi.config.table.ingestion.IngestionConfig;
+import org.apache.pinot.spi.config.table.ingestion.SourceFieldConfig;
 import org.apache.pinot.spi.config.table.ingestion.TransformConfig;
 import org.apache.pinot.spi.data.Schema;
 import org.apache.pinot.spi.recordtransformer.RecordTransformer;
 import org.apache.pinot.spi.recordtransformer.enricher.RecordEnricher;
 import org.apache.pinot.spi.recordtransformer.enricher.RecordEnricherRegistry;
+import org.apache.pinot.spi.utils.PinotDataType;
 
 
 public class RecordTransformerUtils {
@@ -42,9 +46,15 @@ public class RecordTransformerUtils {
   /// Returns a list of [RecordTransformer]s based on the given [TableConfig] and [Schema].
   /// DO NOT CHANGE THE ORDER OF THE RECORD TRANSFORMERS.
   /// The transformers returned are:
+  /// - (Optional) [DataTypeTransformer] to fix the data types of the source fields configured with
+  /// `preComplexTypeTransform = true` in `IngestionConfig#getSourceFieldConfigs()`. It precedes the pre-complex-type
+  /// [RecordEnricher]s and [ComplexTypeTransformer] so that they consume the source fields with the corrected types.
   /// - (Optional) [RecordEnricher]s to enrich the records before complex type transformation.
   /// - (Optional) [ComplexTypeTransformer] to flatten map/unnest list.
   /// - (Optional) Custom [RecordTransformer]s
+  /// - (Optional) [DataTypeTransformer] to fix the data types of the source fields configured with
+  /// `preComplexTypeTransform = false` in `IngestionConfig#getSourceFieldConfigs()`. It precedes the post-complex-type
+  /// [RecordEnricher]s and [ExpressionTransformer] so that they consume the source fields with the corrected types.
   /// - (Optional) [RecordEnricher]s to enrich the records before other transformations.
   /// - (Optional) [ExpressionTransformer] to evaluate expressions and fill the values.
   /// - (Optional) [FilterTransformer] to filter records based on custom predicates.
@@ -64,6 +74,7 @@ public class RecordTransformerUtils {
       boolean skipPostComplexTypeTransformers, boolean skipFilterTransformer) {
     List<RecordTransformer> transformers = new ArrayList<>();
     if (!skipPreComplexTypeTransformers) {
+      addSourceFieldDataTypeTransformer(tableConfig, transformers, true);
       addRecordEnricherTransformers(tableConfig, transformers, true);
     }
     if (!skipComplexTypeTransformer) {
@@ -74,6 +85,7 @@ public class RecordTransformerUtils {
     }
     Preconditions.checkState(schema != null,
         "Schema must be provided when post complex type transformers are 
requested");
+    addSourceFieldDataTypeTransformer(tableConfig, transformers, false);
     addRecordEnricherTransformers(tableConfig, transformers, false);
     addIfNotNoOp(transformers, new ExpressionTransformer(tableConfig, schema));
     if (!skipFilterTransformer) {
@@ -132,6 +144,29 @@ public class RecordTransformerUtils {
     }
   }
 
+  private static void addSourceFieldDataTypeTransformer(TableConfig 
tableConfig, List<RecordTransformer> transformers,
+      boolean preComplexTypeTransform) {
+    IngestionConfig ingestionConfig = tableConfig.getIngestionConfig();
+    if (ingestionConfig == null) {
+      return;
+    }
+    List<SourceFieldConfig> sourceFieldConfigs = 
ingestionConfig.getSourceFieldConfigs();
+    if (CollectionUtils.isEmpty(sourceFieldConfigs)) {
+      return;
+    }
+    Map<String, PinotDataType> dataTypes = new HashMap<>();
+    for (SourceFieldConfig sourceFieldConfig : sourceFieldConfigs) {
+      // If pre-ComplexType transformers are requested, add only pre-ComplexType source fields. Similarly, if
+      // non pre-ComplexType transformers are requested, add only non pre-ComplexType source fields.
+      if (sourceFieldConfig.isPreComplexTypeTransform() == 
preComplexTypeTransform) {
+        dataTypes.put(sourceFieldConfig.getName(), 
sourceFieldConfig.getDataType());
+      }
+    }
+    if (!dataTypes.isEmpty()) {
+      transformers.add(new DataTypeTransformer(tableConfig, dataTypes));
+    }
+  }
+
   private static void addRecordEnricherTransformers(TableConfig tableConfig, 
List<RecordTransformer> transformers,
       boolean preComplexTypeTransformers) {
     IngestionConfig ingestionConfig = tableConfig.getIngestionConfig();
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
index ff15a2fd0f4..9106f7fe56b 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
@@ -89,6 +89,7 @@ import org.apache.pinot.spi.config.table.ingestion.EnrichmentConfig;
 import org.apache.pinot.spi.config.table.ingestion.FilterConfig;
 import org.apache.pinot.spi.config.table.ingestion.IngestionConfig;
 import 
org.apache.pinot.spi.config.table.ingestion.SchemaConformingTransformerConfig;
+import org.apache.pinot.spi.config.table.ingestion.SourceFieldConfig;
 import org.apache.pinot.spi.config.table.ingestion.StreamIngestionConfig;
 import org.apache.pinot.spi.config.table.ingestion.TransformConfig;
 import org.apache.pinot.spi.data.DateTimeFieldSpec;
@@ -648,6 +649,23 @@ public final class TableConfigUtils {
         });
       }
 
+      // Source field configs
+      List<SourceFieldConfig> sourceFieldConfigs = 
ingestionConfig.getSourceFieldConfigs();
+      if (sourceFieldConfigs != null) {
+        // A source field can be configured once per phase (pre- and post-complex-type), but not twice within the same
+        // phase, because that would yield two DataTypeTransformers targeting it in the same phase, which is ambiguous.
+        Set<String> preComplexTypeFieldNames = new HashSet<>();
+        Set<String> postComplexTypeFieldNames = new HashSet<>();
+        for (SourceFieldConfig sourceFieldConfig : sourceFieldConfigs) {
+          String name = sourceFieldConfig.getName();
+          boolean preComplexTypeTransform = 
sourceFieldConfig.isPreComplexTypeTransform();
+          Set<String> fieldNames = preComplexTypeTransform ? 
preComplexTypeFieldNames : postComplexTypeFieldNames;
+          Preconditions.checkState(fieldNames.add(name),
+              "Duplicate SourceFieldConfig found for source field: %s 
(preComplexTypeTransform: %s)", name,
+              preComplexTypeTransform);
+        }
+      }
+
       // Enrichment configs
       List<EnrichmentConfig> enrichmentConfigs = 
ingestionConfig.getEnrichmentConfigs();
       if (enrichmentConfigs != null) {
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
index fa46eea8655..76fd616aaaa 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
@@ -24,6 +24,8 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import org.apache.pinot.common.utils.ServiceStartableUtils;
 import org.apache.pinot.segment.local.segment.creator.TransformPipeline;
@@ -32,6 +34,7 @@ import org.apache.pinot.spi.config.table.TableType;
 import org.apache.pinot.spi.config.table.ingestion.FilterConfig;
 import org.apache.pinot.spi.config.table.ingestion.IngestionConfig;
 import 
org.apache.pinot.spi.config.table.ingestion.SchemaConformingTransformerConfig;
+import org.apache.pinot.spi.config.table.ingestion.SourceFieldConfig;
 import org.apache.pinot.spi.config.table.ingestion.TransformConfig;
 import org.apache.pinot.spi.data.DateTimeFormatSpec;
 import org.apache.pinot.spi.data.DimensionFieldSpec;
@@ -43,6 +46,7 @@ import org.apache.pinot.spi.data.readers.GenericRow;
 import org.apache.pinot.spi.env.PinotConfiguration;
 import org.apache.pinot.spi.recordtransformer.RecordTransformer;
 import org.apache.pinot.spi.utils.CommonConstants;
+import org.apache.pinot.spi.utils.PinotDataType;
 import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
 import org.testng.annotations.Test;
 
@@ -568,6 +572,50 @@ public class RecordTransformerTest {
     assertTrue(transformers.get(7) instanceof SanitizationTransformer);
   }
 
+  @Test
+  public void testSourceFieldDataTypeTransformerOrder() {
+    // The pre-complex-type source-field transformer runs first; the post-complex-type one runs right before the
+    // ExpressionTransformer. With no complex-type transformer or enricher configured, they are the first two
+    // transformers in the list.
+    Schema schema = new 
Schema.SchemaBuilder().addSingleValueDimension("svInt", DataType.INT)
+        .addSingleValueDimension("expressionTestColumn", DataType.INT)
+        .build();
+
+    IngestionConfig ingestionConfig = new IngestionConfig();
+    ingestionConfig.setSourceFieldConfigs(List.of(
+        new SourceFieldConfig("preField", PinotDataType.LONG, true),
+        new SourceFieldConfig("postField", PinotDataType.STRING, false)
+    ));
+    ingestionConfig.setTransformConfigs(List.of(new 
TransformConfig("expressionTestColumn", "plus(svInt, 10)")));
+    TableConfig tableConfig = new 
TableConfigBuilder(TableType.OFFLINE).setTableName("testTable")
+        .setIngestionConfig(ingestionConfig)
+        .build();
+
+    List<RecordTransformer> transformers = 
RecordTransformerUtils.getDefaultTransformers(tableConfig, schema);
+    assertTrue(transformers.get(0) instanceof DataTypeTransformer);
+    assertEquals(transformers.get(0).getInputColumns(), Set.of("preField"));
+    assertTrue(transformers.get(1) instanceof DataTypeTransformer);
+    assertEquals(transformers.get(1).getInputColumns(), Set.of("postField"));
+    assertTrue(transformers.get(2) instanceof ExpressionTransformer);
+  }
+
+  @Test
+  public void testSourceFieldDataTypeConversion() {
+    DataTypeTransformer transformer = new DataTypeTransformer(TABLE_CONFIG, 
Map.of("srcLong", PinotDataType.LONG));
+
+    // A mistyped (String) source value is converted to the configured type.
+    GenericRow record = new GenericRow();
+    record.putValue("srcLong", "12345");
+    transformer.transform(record);
+    assertEquals(record.getValue("srcLong"), 12345L);
+
+    // A null source value is handled without throwing.
+    GenericRow nullRecord = new GenericRow();
+    nullRecord.putValue("srcLong", null);
+    transformer.transform(nullRecord);
+    assertNull(nullRecord.getValue("srcLong"));
+  }
+
   @Test
   public void testScalarOps() {
     IngestionConfig ingestionConfig = new IngestionConfig();
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java
index 888c2de86d5..e847de4d263 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java
@@ -63,6 +63,7 @@ import org.apache.pinot.spi.config.table.ingestion.BatchIngestionConfig;
 import org.apache.pinot.spi.config.table.ingestion.ComplexTypeConfig;
 import org.apache.pinot.spi.config.table.ingestion.FilterConfig;
 import org.apache.pinot.spi.config.table.ingestion.IngestionConfig;
+import org.apache.pinot.spi.config.table.ingestion.SourceFieldConfig;
 import org.apache.pinot.spi.config.table.ingestion.StreamIngestionConfig;
 import org.apache.pinot.spi.config.table.ingestion.TransformConfig;
 import org.apache.pinot.spi.data.DimensionFieldSpec;
@@ -82,6 +83,7 @@ import org.apache.pinot.spi.utils.CommonConstants;
 import org.apache.pinot.spi.utils.CommonConstants.Segment.AssignmentStrategy;
 import org.apache.pinot.spi.utils.Enablement;
 import org.apache.pinot.spi.utils.JsonUtils;
+import org.apache.pinot.spi.utils.PinotDataType;
 import org.apache.pinot.spi.utils.PinotMd5Mode;
 import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
 import org.mockito.Mockito;
@@ -569,6 +571,35 @@ public class TableConfigUtilsTest {
     }
   }
 
+  @Test
+  public void ingestionSourceFieldConfigsTest() {
+    Schema schema = new 
Schema.SchemaBuilder().setSchemaName(TABLE_NAME).build();
+    IngestionConfig ingestionConfig = new IngestionConfig();
+    TableConfig tableConfig =
+        new 
TableConfigBuilder(TableType.OFFLINE).setTableName(TABLE_NAME).setIngestionConfig(ingestionConfig).build();
+
+    // A field configured once per phase, plus a field shared across phases, are all valid.
+    ingestionConfig.setSourceFieldConfigs(List.of(
+        new SourceFieldConfig("preOnly", PinotDataType.LONG, true),
+        new SourceFieldConfig("postOnly", PinotDataType.STRING, false),
+        new SourceFieldConfig("shared", PinotDataType.LONG, true),
+        new SourceFieldConfig("shared", PinotDataType.INT, false)
+    ));
+    TableConfigUtils.validateIngestionConfig(tableConfig, schema);
+
+    // The same field twice within the same phase is rejected.
+    ingestionConfig.setSourceFieldConfigs(List.of(
+        new SourceFieldConfig("dup", PinotDataType.LONG, false),
+        new SourceFieldConfig("dup", PinotDataType.INT, false)
+    ));
+    try {
+      TableConfigUtils.validateIngestionConfig(tableConfig, schema);
+      fail("Should fail on duplicate SourceFieldConfig within the same phase");
+    } catch (IllegalStateException e) {
+      // expected
+    }
+  }
+
   @Test
   public void ingestionAggregationConfigsTest() {
     Schema schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME)
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/IngestionConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/IngestionConfig.java
index c126c9b0487..e18e15a5e1f 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/IngestionConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/IngestionConfig.java
@@ -18,7 +18,6 @@
  */
 package org.apache.pinot.spi.config.table.ingestion;
 
-import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.annotation.JsonPropertyDescription;
 import java.util.List;
 import javax.annotation.Nullable;
@@ -38,6 +37,9 @@ public class IngestionConfig extends BaseJsonConfig {
   @JsonPropertyDescription("Config related to the stream data sources")
   private StreamIngestionConfig _streamIngestionConfig;
 
+  @JsonPropertyDescription("Configs to fix the data types of the source fields 
before applying other transforms")
+  private List<SourceFieldConfig> _sourceFieldConfigs;
+
   @JsonPropertyDescription("Config related to filtering records during 
ingestion")
   private FilterConfig _filterConfig;
 
@@ -51,16 +53,8 @@ public class IngestionConfig extends BaseJsonConfig {
   private ComplexTypeConfig _complexTypeConfig;
 
   @JsonPropertyDescription("Config related to the SchemaConformingTransformer")
-  @JsonProperty("schemaConformingTransformerConfig")
   private SchemaConformingTransformerConfig _schemaConformingTransformerConfig;
 
-  @JsonPropertyDescription("Config related to the 
SchemaConformingTransformerV2 (backward compatibility)")
-  @JsonProperty("schemaConformingTransformerV2Config")
-  public void setSchemaConformingTransformerV2Config(
-      SchemaConformingTransformerConfig schemaConformingTransformerConfig) {
-    _schemaConformingTransformerConfig = schemaConformingTransformerConfig;
-  }
-
   @JsonPropertyDescription("Configs related to record aggregation function 
applied during ingestion")
   private List<AggregationConfig> _aggregationConfigs;
 
@@ -84,26 +78,6 @@ public class IngestionConfig extends BaseJsonConfig {
   private int _ingestionExceptionLogRateLimitPerMin =
       
CommonConstants.IngestionConfigs.DEFAULT_INGESTION_EXCEPTION_LOG_RATE_LIMIT_PER_MIN;
 
-  @Deprecated
-  public IngestionConfig(@Nullable BatchIngestionConfig batchIngestionConfig,
-      @Nullable StreamIngestionConfig streamIngestionConfig, @Nullable 
FilterConfig filterConfig,
-      @Nullable List<EnrichmentConfig> enrichmentConfigs,
-      @Nullable List<TransformConfig> transformConfigs, @Nullable 
ComplexTypeConfig complexTypeConfig,
-      @Nullable SchemaConformingTransformerConfig 
schemaConformingTransformerConfig,
-      @Nullable List<AggregationConfig> aggregationConfigs) {
-    _batchIngestionConfig = batchIngestionConfig;
-    _streamIngestionConfig = streamIngestionConfig;
-    _filterConfig = filterConfig;
-    _enrichmentConfigs = enrichmentConfigs;
-    _transformConfigs = transformConfigs;
-    _complexTypeConfig = complexTypeConfig;
-    _schemaConformingTransformerConfig = schemaConformingTransformerConfig;
-    _aggregationConfigs = aggregationConfigs;
-  }
-
-  public IngestionConfig() {
-  }
-
   @Nullable
   public BatchIngestionConfig getBatchIngestionConfig() {
     return _batchIngestionConfig;
@@ -114,6 +88,11 @@ public class IngestionConfig extends BaseJsonConfig {
     return _streamIngestionConfig;
   }
 
+  @Nullable
+  public List<SourceFieldConfig> getSourceFieldConfigs() {
+    return _sourceFieldConfigs;
+  }
+
   @Nullable
   public FilterConfig getFilterConfig() {
     return _filterConfig;
@@ -176,6 +155,10 @@ public class IngestionConfig extends BaseJsonConfig {
     _streamIngestionConfig = streamIngestionConfig;
   }
 
+  public void setSourceFieldConfigs(List<SourceFieldConfig> 
sourceFieldConfigs) {
+    _sourceFieldConfigs = sourceFieldConfigs;
+  }
+
   public void setFilterConfig(FilterConfig filterConfig) {
     _filterConfig = filterConfig;
   }
@@ -197,6 +180,13 @@ public class IngestionConfig extends BaseJsonConfig {
     _schemaConformingTransformerConfig = schemaConformingTransformerConfig;
   }
 
+  /// For backward compatibility.
+  @SuppressWarnings("unused")
+  public void setSchemaConformingTransformerV2Config(
+      SchemaConformingTransformerConfig schemaConformingTransformerConfig) {
+    _schemaConformingTransformerConfig = schemaConformingTransformerConfig;
+  }
+
   public void setAggregationConfigs(List<AggregationConfig> 
aggregationConfigs) {
     _aggregationConfigs = aggregationConfigs;
   }
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfig.java
new file mode 100644
index 00000000000..34d4c4d2ac8
--- /dev/null
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfig.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.spi.config.table.ingestion;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonPropertyDescription;
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pinot.spi.config.BaseJsonConfig;
+import org.apache.pinot.spi.utils.PinotDataType;
+
+
+/// Configures a data type fix for a single source (input) field during ingestion, applied before other transformers
+/// consume the field. Useful when a source field arrives with a type that a downstream enricher or transform
+/// expression does not expect (e.g. an epoch timestamp arriving as a `String`).
+///
+/// Each config maps a source field ([#getName()]) to the target [PinotDataType] ([#getDataType()]) it should be
+/// converted to. The [#isPreComplexTypeTransform()] flag selects when the fix runs:
+/// - `true`: before the complex type transformation (and the pre-complex-type enrichers), so the corrected value can
+///   feed complex type flattening and pre-complex-type enrichment.
+/// - `false` (default): after the complex type transformation, so flattened/unnested fields can be fixed before the
+///   post-complex-type enrichers and the expression transformer run.
+///
+/// A source field may be configured at most once per phase.
+public class SourceFieldConfig extends BaseJsonConfig {
+  @JsonPropertyDescription("Name of the source field to fix the data type for")
+  private final String _name;
+
+  @JsonPropertyDescription("Target data type (PinotDataType name, e.g. INT, 
LONG, STRING, LONG_ARRAY) to convert "
+      + "the source field to")
+  private final PinotDataType _dataType;
+
+  @JsonPropertyDescription("Whether the data type conversion is applied before 
the complex type transformation")
+  private final boolean _preComplexTypeTransform;
+
+  @JsonCreator
+  public SourceFieldConfig(@JsonProperty(value = "name", required = true) 
String name,
+      @JsonProperty(value = "dataType", required = true) PinotDataType 
dataType,
+      @JsonProperty("preComplexTypeTransform") boolean 
preComplexTypeTransform) {
+    Preconditions.checkArgument(StringUtils.isNotBlank(name), "'name' must be 
set in SourceFieldConfig");
+    Preconditions.checkArgument(dataType != null, "'dataType' must be set in 
SourceFieldConfig for source field: %s",
+        name);
+    _name = name;
+    _dataType = dataType;
+    _preComplexTypeTransform = preComplexTypeTransform;
+  }
+
+  public String getName() {
+    return _name;
+  }
+
+  public PinotDataType getDataType() {
+    return _dataType;
+  }
+
+  public boolean isPreComplexTypeTransform() {
+    return _preComplexTypeTransform;
+  }
+}
diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfigTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfigTest.java
new file mode 100644
index 00000000000..716c5dd7842
--- /dev/null
+++ b/pinot-spi/src/test/java/org/apache/pinot/spi/config/table/ingestion/SourceFieldConfigTest.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.spi.config.table.ingestion;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import org.apache.pinot.spi.utils.JsonUtils;
+import org.apache.pinot.spi.utils.PinotDataType;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertThrows;
+import static org.testng.Assert.assertTrue;
+
+
+public class SourceFieldConfigTest {
+
+  @Test
+  public void deserializesAllFields()
+      throws JsonProcessingException {
+    SourceFieldConfig config = JsonUtils.stringToObject(
+        "{\"name\": \"ts\", \"dataType\": \"LONG\", 
\"preComplexTypeTransform\": true}", SourceFieldConfig.class);
+    assertEquals(config.getName(), "ts");
+    assertEquals(config.getDataType(), PinotDataType.LONG);
+    assertTrue(config.isPreComplexTypeTransform());
+  }
+
+  @Test
+  public void preComplexTypeTransformDefaultsToFalse()
+      throws JsonProcessingException {
+    SourceFieldConfig config =
+        JsonUtils.stringToObject("{\"name\": \"tags\", \"dataType\": 
\"STRING_ARRAY\"}", SourceFieldConfig.class);
+    assertEquals(config.getDataType(), PinotDataType.STRING_ARRAY);
+    assertFalse(config.isPreComplexTypeTransform());
+  }
+
+  @Test
+  public void roundTripsThroughJson()
+      throws JsonProcessingException {
+    SourceFieldConfig config = new SourceFieldConfig("ts", PinotDataType.INT, 
false);
+    assertEquals(JsonUtils.stringToObject(config.toJsonString(), 
SourceFieldConfig.class), config);
+  }
+
+  @Test
+  public void rejectsMissingName() {
+    assertThrows(IllegalArgumentException.class, () -> new 
SourceFieldConfig(null, PinotDataType.INT, false));
+    assertThrows(IllegalArgumentException.class, () -> new 
SourceFieldConfig("", PinotDataType.INT, false));
+    assertThrows(IllegalArgumentException.class, () -> new SourceFieldConfig(" 
  ", PinotDataType.INT, false));
+  }
+
+  @Test
+  public void rejectsMissingDataType() {
+    assertThrows(IllegalArgumentException.class, () -> new 
SourceFieldConfig("ts", null, false));
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to