Jackie-Jiang commented on code in PR #18325:
URL: https://github.com/apache/pinot/pull/18325#discussion_r3143006156


##########
pinot-plugins/pinot-input-format/pinot-parquet/src/main/java/org/apache/pinot/plugin/inputformat/parquet/ParquetNativeRecordExtractor.java:
##########
@@ -173,26 +176,101 @@ public static long convertInt96ToLong(byte[] int96Bytes) 
{
   }
 
   public Object[] extractList(Group group) {
-    int numValues = group.getType().getFieldCount();
-    Object[] array = new Object[numValues];
-    for (int i = 0; i < numValues; i++) {
-      array[i] = extractValue(group, i);
-    }
-    if (numValues == 1 && array[0] == null) {
-      return new Object[0];
-    }
-    if (numValues == 1 && array[0] instanceof Object[]) {
-      return (Object[]) array[0];
+    // Group is annotated with LIST. Per the Parquet LogicalTypes spec it 
always has exactly one child, the
+    // repeated wrapper. The wrapper's schema (not the row data) decides which 
encoding we are reading, so we
+    // resolve that once here and dispatch the whole list down a single branch.
+    String parentListName = group.getType().getName();
+    Type repeatedField = group.getType().getType(0);
+    int numValues = group.getFieldRepetitionCount(0);
+    Object[] values = new Object[numValues];
+    if (isListElementWrapper(repeatedField, parentListName)) {
+      // Standard 3-level LIST encoding (modern Parquet writers, parquet-avro, 
Spark, Arrow, …):
+      //   <list-rep> group <name> (LIST) {
+      //     repeated group list {            // wrapper carries only 
repetition
+      //       <elem-rep> <elem-type> element;
+      //     }
+      //   }
+      // Per the Parquet LogicalTypes backward-compat rules this also covers 
any single-field repeated group whose
+      // wrapper field is NOT named `array` or `<list>_tuple` (rule 4) — the 
inner field IS the element, regardless
+      // of its name. Strip the wrapper so each row surfaces the bare element 
value, matching Apache Arrow /
+      // parquet-avro (with `parquet.avro.add-list-element-records=false`).
+      for (int i = 0; i < numValues; i++) {
+        values[i] = extractValue(group.getGroup(0, i), 0);
+      }
+    } else {
+      // Legacy non-wrapper repeated forms:
+      //   * Repeated primitive: `repeated <elem-type> <name>;` (rule 1) — 
`repeatedField.isPrimitive()`.
+      //   * Repeated multi-field group: `repeated group <name> { <fields…> 
};` (rule 2) — the group IS the element.
+      //   * Repeated single-field group named `array` or `<list>_tuple` (rule 
3) — the group IS the element,
+      //     preserved as a struct/Map.
+      // In all of these the repeated field is the element, so we extract it 
directly without unwrapping.
+      for (int i = 0; i < numValues; i++) {
+        values[i] = extractValue(group, 0, repeatedField, i);
+      }
     }
-    return array;
+    return values;
   }
 
   public Map<String, Object> extractMap(Group group) {
+    // Plain Parquet group (no LIST/MAP annotation) — surfaces as a Map keyed 
by child field name.
+    // Reached for nested struct fields and for groups read with the 
legacy/un-annotated branch.
     int numValues = group.getType().getFieldCount();
     Map<String, Object> map = Maps.newHashMapWithExpectedSize(numValues);
     for (int i = 0; i < numValues; i++) {
       map.put(group.getType().getType(i).getName(), extractValue(group, i));
     }
     return map;
   }
+
+  private Map<String, Object> extractKeyValueMap(Group group) {
+    // Group is annotated with MAP. Per the Parquet LogicalTypes spec it 
always has exactly one child — a repeated
+    // group with two fields named "key" and "value":
+    //   <map-repetition> group <name> (MAP) {
+    //     repeated group key_value {
+    //       required <key-type>   key;
+    //       <value-repetition> <value-type> value;
+    //     }
+    //   }
+    // The repeated wrapper name (`key_value` / `map`) and field order vary 
across writers, so we resolve the
+    // key/value field indices from the schema once and reuse them for every 
entry.
+    //
+    // NOTE: Parquet does NOT guarantee that MAP entries are returned in any 
particular order on read — neither
+    // sorted nor in insertion order. Writers, page boundaries, and dictionary 
encodings can all reorder entries.
+    // If you need a stable order, write the data as a LIST of STRUCT<key, 
value> instead of using the native MAP
+    // logical type. We therefore use a plain HashMap here and make no 
ordering promise to downstream consumers.
+    int numValues = group.getFieldRepetitionCount(0);
+    if (numValues == 0) {
+      return Map.of();
+    }
+    GroupType keyValueType = group.getType().getType(0).asGroupType();
+    int keyIndex = keyValueType.getFieldIndex("key");
+    int valueIndex = keyValueType.getFieldIndex("value");
+    Map<String, Object> map = Maps.newHashMapWithExpectedSize(numValues);
+    for (int i = 0; i < numValues; i++) {
+      Group keyValueGroup = group.getGroup(0, i);
+      Object key = extractValue(keyValueGroup, keyIndex);
+      Object value = extractValue(keyValueGroup, valueIndex);
+      map.put(key.toString(), value);
+    }
+    return map;
+  }
+
+  private boolean isListElementWrapper(Type repeatedField, String 
parentListName) {

Review Comment:
   Rename this method



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to