Jackie-Jiang commented on a change in pull request #5597: URL: https://github.com/apache/incubator-pinot/pull/5597#discussion_r444446577
########## File path: pinot-core/src/main/java/org/apache/pinot/core/util/IngestionUtils.java ########## @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.util; + +import com.google.common.collect.Sets; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.pinot.core.data.function.FunctionEvaluator; +import org.apache.pinot.core.data.function.FunctionEvaluatorFactory; +import org.apache.pinot.spi.config.table.IngestionConfig; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; + + +/** + * Utility methods for extracting source and destination fields from ingestion configs + */ +public class IngestionUtils { + + /** + * Extracts all fields required by the {@link org.apache.pinot.spi.data.readers.RecordExtractor} from the given TableConfig and Schema + * Fields for ingestion come from 2 places: + * 1. The schema + * 2. The ingestion config in the table config. The ingestion config (e.g. filter) can have fields which are not in the schema. + */ + public static Set<String> getFieldsForRecordExtractor(IngestionConfig ingestionConfig, Schema schema) { Review comment: (nit) annotate `ingestionConfig` as nullable ########## File path: pinot-core/src/main/java/org/apache/pinot/core/segment/creator/StatsCollectorConfig.java ########## @@ -35,16 +36,19 @@ public class StatsCollectorConfig { private final Schema _schema; + private final TableConfig _tableConfig; private final SegmentPartitionConfig _segmentPartitionConfig; /** * Constructor for the class. * @param schema Data schema * @param segmentPartitionConfig Segment partitioning config */ - public StatsCollectorConfig(@Nonnull Schema schema, SegmentPartitionConfig segmentPartitionConfig) { + public StatsCollectorConfig(Schema schema, TableConfig tableConfig, @Nullable SegmentPartitionConfig segmentPartitionConfig) { Review comment: (nit) Can we put table config in front of schema for consistency? ########## File path: pinot-core/src/main/java/org/apache/pinot/core/data/recordtransformer/FilterTransformer.java ########## @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.data.recordtransformer; + +import org.apache.pinot.core.data.function.FunctionEvaluator; +import org.apache.pinot.core.data.function.FunctionEvaluatorFactory; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.readers.GenericRow; + + +/** + * Based on filter config, decide whether to skip or allow this record. + * If record should be skipped, puts a special key in the record. + */ +public class FilterTransformer implements RecordTransformer { + + private FunctionEvaluator _evaluator = null; Review comment: (nit) Make it final ########## File path: pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/GenericRow.java ########## @@ -49,7 +49,17 @@ */ public class GenericRow { + /** + * This key is used by a Decoder/RecordReader to handle 1 record to many records flattening. + * If a Decoder/RecordReader produces multiple GenericRows from the given record, they must be put into the destination GenericRow as a List<GenericRow> with this key + * The segment generation drivers handle this key as a special case and process the multiple records + */ public static final String MULTIPLE_RECORDS_KEY = "$MULTIPLE_RECORDS_KEY$"; + /** + * This key is used by the FilterTransformer to handle filtering out of records during ingestion + * The FilterTransformer puts this key into the GenericRow with value true, if the record matches the filtering out criteria, based on FilterConfig + */ + public static final String FILTER_RECORD_KEY = "$FILTER_RECORD_KEY$"; Review comment: Rename it to `SKIP_RECORD_KEY` for clarity? @mcvsubbu The `$` already denotes the field to be internal. I wouldn't recommend making the key too long because it will add cost for the hash lookup. ########## File path: pinot-core/src/test/java/org/apache/pinot/core/segment/index/creator/SegmentGenerationWithFilterRecordsKey.java ########## @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.segment.index.creator; + +import com.google.common.collect.Lists; +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.core.data.readers.GenericRowRecordReader; +import org.apache.pinot.core.data.readers.PinotSegmentRecordReader; +import org.apache.pinot.core.indexsegment.generator.SegmentGeneratorConfig; +import org.apache.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl; +import org.apache.pinot.core.segment.index.metadata.SegmentMetadataImpl; +import org.apache.pinot.core.segment.store.SegmentDirectory; +import org.apache.pinot.spi.config.table.IngestionConfig; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.config.table.TableType; +import org.apache.pinot.spi.config.table.ingestion.FilterConfig; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; +import org.apache.pinot.spi.utils.builder.TableConfigBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +public class SegmentGenerationWithFilterRecordsKey { Review comment: Please rename this class to `*Test` or the framework cannot auto-detect this as a test ########## File path: pinot-core/src/main/java/org/apache/pinot/core/util/IngestionUtils.java ########## @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.util; + +import com.google.common.collect.Sets; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.pinot.core.data.function.FunctionEvaluator; +import org.apache.pinot.core.data.function.FunctionEvaluatorFactory; +import org.apache.pinot.spi.config.table.IngestionConfig; +import org.apache.pinot.spi.data.FieldSpec; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.data.readers.GenericRow; + + +/** + * Utility methods for extracting source and destination fields from ingestion configs + */ +public class IngestionUtils { + + /** + * Extracts all fields required by the {@link org.apache.pinot.spi.data.readers.RecordExtractor} from the given TableConfig and Schema + * Fields for ingestion come from 2 places: + * 1. The schema + * 2. The ingestion config in the table config. The ingestion config (e.g. filter) can have fields which are not in the schema. + */ + public static Set<String> getFieldsForRecordExtractor(IngestionConfig ingestionConfig, Schema schema) { + Set<String> fieldsForRecordExtractor = new HashSet<>(); + fieldsForRecordExtractor.addAll(getFieldsFromIngestionConfig(ingestionConfig)); Review comment: Recommend passing the set into the helper method to prevent creating multiple sets. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
