Jackie-Jiang commented on code in PR #11739: URL: https://github.com/apache/pinot/pull/11739#discussion_r1364551387
########## pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.transform.function; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.operator.ColumnContext; +import org.apache.pinot.core.operator.blocks.ValueBlock; +import org.apache.pinot.core.operator.transform.TransformResultMetadata; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReaderContext; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * The <code>JsonExtractIndexTransformFunction</code> provides the same behavior as JsonExtractScalar, with the + * implementation changed to read values from the JSON index. For large JSON blobs this can be faster than parsing + * GBs of JSON at query time. For small JSON blobs/highly filtered input this is generally slower than the *scalar + * implementation. The inflection point is highly dependent on the number of docs remaining post filter. 
+ */ +public class JsonExtractIndexTransformFunction extends BaseTransformFunction { + public static final String FUNCTION_NAME = "jsonExtractIndex"; + + private TransformFunction _jsonFieldTransformFunction; + private String _jsonPathString; + private TransformResultMetadata _resultMetadata; + private JsonIndexReader _jsonIndexReader; + private Object _defaultValue; + private JsonIndexReaderContext _jsonIndexReaderContext; + + @Override + public String getName() { + return FUNCTION_NAME; + } + + @Override + public void init(List<TransformFunction> arguments, Map<String, ColumnContext> columnContextMap) { + // Check that there are exactly 3 or 4 arguments + if (arguments.size() < 3 || arguments.size() > 4) { + throw new IllegalArgumentException( + "Expected 3/4 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'])"); + } + + TransformFunction firstArgument = arguments.get(0); + if (firstArgument instanceof LiteralTransformFunction || !firstArgument.getResultMetadata().isSingleValue()) { + throw new IllegalArgumentException( + "The first argument of jsonExtractIndex transform function must be a single-valued column or a transform " + + "function"); + } + if (firstArgument instanceof IdentifierTransformFunction) { + String columnName = ((IdentifierTransformFunction) firstArgument).getColumnName(); + if (columnContextMap.containsKey(columnName)) { Review Comment: (minor) This check is redundant. Identifier is guaranteed to be in the `columnContextMap` ########## pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.transform.function; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.operator.ColumnContext; +import org.apache.pinot.core.operator.blocks.ValueBlock; +import org.apache.pinot.core.operator.transform.TransformResultMetadata; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReaderContext; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * The <code>JsonExtractIndexTransformFunction</code> provides the same behavior as JsonExtractScalar, with the + * implementation changed to read values from the JSON index. For large JSON blobs this can be faster than parsing + * GBs of JSON at query time. For small JSON blobs/highly filtered input this is generally slower than the *scalar + * implementation. The inflection point is highly dependent on the number of docs remaining post filter. 
+ */ +public class JsonExtractIndexTransformFunction extends BaseTransformFunction { + public static final String FUNCTION_NAME = "jsonExtractIndex"; + + private TransformFunction _jsonFieldTransformFunction; + private String _jsonPathString; + private TransformResultMetadata _resultMetadata; + private JsonIndexReader _jsonIndexReader; + private Object _defaultValue; + private JsonIndexReaderContext _jsonIndexReaderContext; + + @Override + public String getName() { + return FUNCTION_NAME; + } + + @Override + public void init(List<TransformFunction> arguments, Map<String, ColumnContext> columnContextMap) { + // Check that there are exactly 3 or 4 arguments + if (arguments.size() < 3 || arguments.size() > 4) { + throw new IllegalArgumentException( + "Expected 3/4 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'])"); + } + + TransformFunction firstArgument = arguments.get(0); + if (firstArgument instanceof LiteralTransformFunction || !firstArgument.getResultMetadata().isSingleValue()) { + throw new IllegalArgumentException( + "The first argument of jsonExtractIndex transform function must be a single-valued column or a transform " + + "function"); + } Review Comment: This is redundant. Checking if the first argument is identifier, and if it contains JSON index is good enough ########## pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.transform.function; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.operator.ColumnContext; +import org.apache.pinot.core.operator.blocks.ValueBlock; +import org.apache.pinot.core.operator.transform.TransformResultMetadata; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReaderContext; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * The <code>JsonExtractIndexTransformFunction</code> provides the same behavior as JsonExtractScalar, with the + * implementation changed to read values from the JSON index. For large JSON blobs this can be faster than parsing + * GBs of JSON at query time. For small JSON blobs/highly filtered input this is generally slower than the *scalar + * implementation. The inflection point is highly dependent on the number of docs remaining post filter. 
+ */ +public class JsonExtractIndexTransformFunction extends BaseTransformFunction { + public static final String FUNCTION_NAME = "jsonExtractIndex"; + + private TransformFunction _jsonFieldTransformFunction; + private String _jsonPathString; + private TransformResultMetadata _resultMetadata; + private JsonIndexReader _jsonIndexReader; + private Object _defaultValue; + private JsonIndexReaderContext _jsonIndexReaderContext; + + @Override + public String getName() { + return FUNCTION_NAME; + } + + @Override + public void init(List<TransformFunction> arguments, Map<String, ColumnContext> columnContextMap) { + // Check that there are exactly 3 or 4 arguments + if (arguments.size() < 3 || arguments.size() > 4) { + throw new IllegalArgumentException( + "Expected 3/4 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'])"); + } + + TransformFunction firstArgument = arguments.get(0); + if (firstArgument instanceof LiteralTransformFunction || !firstArgument.getResultMetadata().isSingleValue()) { + throw new IllegalArgumentException( + "The first argument of jsonExtractIndex transform function must be a single-valued column or a transform " + + "function"); + } + if (firstArgument instanceof IdentifierTransformFunction) { + String columnName = ((IdentifierTransformFunction) firstArgument).getColumnName(); + if (columnContextMap.containsKey(columnName)) { + _jsonIndexReader = columnContextMap.get(columnName).getDataSource().getJsonIndex(); + if (_jsonIndexReader == null) { + throw new UnsupportedOperationException("jsonExtractIndex can only be applied on a column with JSON index"); Review Comment: Let's throw `IllegalStateException` ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java: ########## @@ -300,6 +305,92 @@ private int getDocId(int flattenedDocId) { return _docIdMapping.getInt((long) flattenedDocId << 2); } + /** + * 
ImmutableJsonIndexReaderContext holds a cache that is used to accelerate following reads. The cache is specific + * to a key, and therefore should NOT be reused for other keys. For each key, ImmutableJsonIndexReader.createContext() + * should be invoked to create a fresh context. + */ + public static class ImmutableJsonIndexReaderContext implements JsonIndexReaderContext { + private final Int2ObjectOpenHashMap<RoaringBitmap> _cache; + + public ImmutableJsonIndexReaderContext() { + _cache = new Int2ObjectOpenHashMap<>(); + } + + public Int2ObjectOpenHashMap<RoaringBitmap> getCache() { + return _cache; + } + } + + @Override + public ImmutableJsonIndexReaderContext createContext() { + return new ImmutableJsonIndexReaderContext(); + } + + @Override + public String[] getValuesForKeyAndDocs(String key, int[] docIds, JsonIndexReaderContext context) { + ImmutableJsonIndexReaderContext immutableContext = (ImmutableJsonIndexReaderContext) context; + Int2ObjectOpenHashMap<RoaringBitmap> cache = immutableContext.getCache(); + + RoaringBitmap docIdMask = RoaringBitmap.bitmapOf(docIds); + int[] dictIds = getDictIdRangeForKey(key); + Int2ObjectOpenHashMap<String> docIdToValues = new Int2ObjectOpenHashMap<>(docIds.length); + + if (cache.isEmpty()) { + for (int dictId = dictIds[0]; dictId < dictIds[1]; dictId++) { Review Comment: Seems we don't even need a map. A list/array of bitmaps should be good enough. Accessing list/array should be much faster than map lookup ########## pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/JsonExtractIndexTransformFunction.java: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.transform.function; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import org.apache.pinot.core.operator.ColumnContext; +import org.apache.pinot.core.operator.blocks.ValueBlock; +import org.apache.pinot.core.operator.transform.TransformResultMetadata; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReader; +import org.apache.pinot.segment.spi.index.reader.JsonIndexReaderContext; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * The <code>JsonExtractIndexTransformFunction</code> provides the same behavior as JsonExtractScalar, with the + * implementation changed to read values from the JSON index. For large JSON blobs this can be faster than parsing + * GBs of JSON at query time. For small JSON blobs/highly filtered input this is generally slower than the *scalar + * implementation. The inflection point is highly dependent on the number of docs remaining post filter. 
+ */ +public class JsonExtractIndexTransformFunction extends BaseTransformFunction { + public static final String FUNCTION_NAME = "jsonExtractIndex"; + + private TransformFunction _jsonFieldTransformFunction; + private String _jsonPathString; + private TransformResultMetadata _resultMetadata; + private JsonIndexReader _jsonIndexReader; + private Object _defaultValue; + private JsonIndexReaderContext _jsonIndexReaderContext; + + @Override + public String getName() { + return FUNCTION_NAME; + } + + @Override + public void init(List<TransformFunction> arguments, Map<String, ColumnContext> columnContextMap) { + // Check that there are exactly 3 or 4 arguments + if (arguments.size() < 3 || arguments.size() > 4) { + throw new IllegalArgumentException( + "Expected 3/4 arguments for transform function: jsonExtractIndex(jsonFieldName, 'jsonPath', 'resultsType'," + + " ['defaultValue'])"); + } + + TransformFunction firstArgument = arguments.get(0); + if (firstArgument instanceof LiteralTransformFunction || !firstArgument.getResultMetadata().isSingleValue()) { + throw new IllegalArgumentException( + "The first argument of jsonExtractIndex transform function must be a single-valued column or a transform " + + "function"); + } + if (firstArgument instanceof IdentifierTransformFunction) { + String columnName = ((IdentifierTransformFunction) firstArgument).getColumnName(); + if (columnContextMap.containsKey(columnName)) { + _jsonIndexReader = columnContextMap.get(columnName).getDataSource().getJsonIndex(); + if (_jsonIndexReader == null) { + throw new UnsupportedOperationException("jsonExtractIndex can only be applied on a column with JSON index"); + } + } + } else { + throw new IllegalArgumentException("jsonExtractIndex can only be applied to a raw column"); + } + _jsonFieldTransformFunction = firstArgument; + _jsonPathString = ((LiteralTransformFunction) arguments.get(1)).getStringLiteral().substring(1); // remove $ prefix Review Comment: We want to check the type first and 
throw a proper exception. Currently it will throw a casting exception, which is not very user-friendly. The same applies to the other arguments ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/json/ImmutableJsonIndexReader.java: ########## @@ -300,6 +305,92 @@ private int getDocId(int flattenedDocId) { return _docIdMapping.getInt((long) flattenedDocId << 2); } + /** + * ImmutableJsonIndexReaderContext holds a cache that is used to accelerate following reads. The cache is specific + * to a key, and therefore should NOT be reused for other keys. For each key, ImmutableJsonIndexReader.createContext() + * should be invoked to create a fresh context. + */ + public static class ImmutableJsonIndexReaderContext implements JsonIndexReaderContext { + private final Int2ObjectOpenHashMap<RoaringBitmap> _cache; + + public ImmutableJsonIndexReaderContext() { + _cache = new Int2ObjectOpenHashMap<>(); + } + + public Int2ObjectOpenHashMap<RoaringBitmap> getCache() { + return _cache; + } + } + + @Override + public ImmutableJsonIndexReaderContext createContext() { + return new ImmutableJsonIndexReaderContext(); + } + + @Override + public String[] getValuesForKeyAndDocs(String key, int[] docIds, JsonIndexReaderContext context) { Review Comment: I wouldn't model it as a context class, since it is not sharable across keys. I suggest adding a separate method `public Int2ObjectOpenHashMap<RoaringBitmap> getMatchingDocsMap(String key)`, then passing this map into this method -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
