steveloughran commented on code in PR #3481:
URL: https://github.com/apache/parquet-java/pull/3481#discussion_r3088101755
##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -324,23 +333,56 @@ public int numArrayElements() {
* @throws IllegalArgumentException if `getType()` does not return
`Type.ARRAY`
*/
public Variant getElementAtIndex(int index) {
- VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value);
+ VariantUtil.ArrayInfo info = arrayInfo();
if (index < 0 || index >= info.numElements) {
return null;
}
- return getElementAtIndex(
- index,
- value,
- metadata,
- info.offsetSize,
- value.position() + info.offsetStartOffset,
- value.position() + info.dataStartOffset);
+ int offsetStart = value.position() + info.offsetStartOffset;
+ int dataStart = value.position() + info.dataStartOffset;
+ int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize
* index, info.offsetSize);
+ return childVariant(VariantUtil.slice(value, dataStart + offset));
+ }
+
+ /**
+ * Creates a child Variant that shares this instance's metadata cache.
+ */
+ private Variant childVariant(ByteBuffer childValue) {
+ return new Variant(childValue, metadata, metadataCache, dictSize);
}
- private static Variant getElementAtIndex(
- int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int
offsetStart, int dataStart) {
- // offsetStart and dataStart are absolute positions in the `value` buffer.
- int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize *
index, offsetSize);
- return new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
+ /**
+ * Returns the metadata dictionary string for the given ID, caching the
result.
+ */
+ String getMetadataKeyCached(int id) {
+ if (metadataCache == null) {
Review Comment:
can you add some comments here, e.g
// demand create shared dictionary cache
##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -324,23 +333,56 @@ public int numArrayElements() {
* @throws IllegalArgumentException if `getType()` does not return
`Type.ARRAY`
*/
public Variant getElementAtIndex(int index) {
- VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value);
+ VariantUtil.ArrayInfo info = arrayInfo();
if (index < 0 || index >= info.numElements) {
return null;
}
- return getElementAtIndex(
- index,
- value,
- metadata,
- info.offsetSize,
- value.position() + info.offsetStartOffset,
- value.position() + info.dataStartOffset);
+ int offsetStart = value.position() + info.offsetStartOffset;
+ int dataStart = value.position() + info.dataStartOffset;
+ int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize
* index, info.offsetSize);
+ return childVariant(VariantUtil.slice(value, dataStart + offset));
+ }
+
+ /**
+ * Creates a child Variant that shares this instance's metadata cache.
+ */
+ private Variant childVariant(ByteBuffer childValue) {
+ return new Variant(childValue, metadata, metadataCache, dictSize);
}
- private static Variant getElementAtIndex(
- int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int
offsetStart, int dataStart) {
- // offsetStart and dataStart are absolute positions in the `value` buffer.
- int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize *
index, offsetSize);
- return new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
+ /**
+ * Returns the metadata dictionary string for the given ID, caching the
result.
+ */
+ String getMetadataKeyCached(int id) {
+ if (metadataCache == null) {
+ metadataCache = new String[dictSize];
+ }
+ if (id < 0 || id >= dictSize) {
Review Comment:
definitely explain this.
##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -206,22 +246,18 @@ public int numObjectElements() {
* @throws IllegalArgumentException if `getType()` does not return
`Type.OBJECT`
*/
public Variant getFieldByKey(String key) {
- VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value);
- // Use linear search for a short list. Switch to binary search when the
length reaches
- // `BINARY_SEARCH_THRESHOLD`.
+ VariantUtil.ObjectInfo info = objectInfo();
+ int idStart = value.position() + info.idStartOffset;
+ int offsetStart = value.position() + info.offsetStartOffset;
+ int dataStart = value.position() + info.dataStartOffset;
+
if (info.numElements < BINARY_SEARCH_THRESHOLD) {
for (int i = 0; i < info.numElements; ++i) {
- ObjectField field = getFieldAtIndex(
- i,
- value,
- metadata,
- info.idSize,
- info.offsetSize,
- value.position() + info.idStartOffset,
- value.position() + info.offsetStartOffset,
- value.position() + info.dataStartOffset);
- if (field.key.equals(key)) {
- return field.value;
+ int id = VariantUtil.readUnsigned(value, idStart + info.idSize * i,
info.idSize);
Review Comment:
readUnsigned will need looking at too; the iceberg code looks better. Later
PR
##########
parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java:
##########
@@ -67,6 +87,26 @@ public Variant(ByteBuffer value, ByteBuffer metadata) {
"Unsupported variant metadata version: %d",
metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
}
+
+ // Pre-compute dictionary size for lazy metadata cache allocation.
+ int pos = this.metadata.position();
+ int metaOffsetSize = ((this.metadata.get(pos) >> 6) & 0x3) + 1;
+ if (this.metadata.remaining() > 1) {
+ this.dictSize = VariantUtil.readUnsigned(this.metadata, pos + 1,
metaOffsetSize);
+ } else {
+ this.dictSize = 0;
Review Comment:
I didn't benchmark empty variants, did I? Not a case we need to worry much
about though
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]