Re: [PR] Api: Support variant extract and fix manifest bounds byte order [iceberg]

via GitHub Tue, 14 Apr 2026 13:33:21 -0700


steveloughran commented on code in PR #15384:
URL: https://github.com/apache/iceberg/pull/15384#discussion_r3082113035



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':

Review Comment:
   use the java17 style 
   
   ```
   pos = switch(ch) {
     case '.' -> appendDotSegment()
     ... etc
   }
   ```
   stops the compiler complaining about not using it



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':
+          pos = appendDotSegment(path, pos, segments);
+          break;
+
+        case '[':
+          pos = appendBracketOrIndexSegment(path, pos, segments, 
bracketMatcher);
+          break;
+
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid path, expected '.' or '[' at position %s: 
%s", pos, path));
+      }
+    }
+
+    return segments;
+  }
+
+  /** Consumes from {@code path[dotPos]} a leading {@code .} and RFC 9535 
shorthand member name. */
+  private static int appendDotSegment(String path, int dotPos, 
List<PathSegment> segments) {

Review Comment:
   nit: add javadocs of args. Personally I'd put segments first as thats the 
one which gets updated, but at least mention it in the javadoc



##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java:
##########
@@ -350,6 +350,71 @@ public void testExtractExpressionBinding() {
     assertThat(boundExtract.type()).isEqualTo(Types.LongType.get());
   }
 
+  @Test
+  public void testVariantExtractBindingPreservesBracketInputPath() {
+    Expression bound = Binder.bind(STRUCT, lessThan(extract("var", 
"$['event_id']", "long"), 100));
+    BoundPredicate<?> pred = TestHelpers.assertAndUnwrap(bound);
+    BoundExtract<?> boundExtract = (BoundExtract<?>) pred.term();
+    assertThat(boundExtract.path()).isEqualTo("$['event_id']");
+  }
+
+  @Test
+  public void testVariantExtractBindingMixedDotAndBracketPath() {
+    Expression bound =
+        Binder.bind(STRUCT, lessThan(extract("var", "$.employee['user.name']", 
"long"), 100));
+    BoundExtract<?> boundExtract = (BoundExtract<?>) 
TestHelpers.assertAndUnwrap(bound).term();
+    assertThat(boundExtract.path()).isEqualTo("$['employee']['user.name']");
+  }
+
+  @Test
+  public void testVariantExtractBindingBracketPathWithRfc9535Unescape() {
+    // Inner segment is a'b after unescaping; normalized bracket form uses \' 
for the quote.
+    Expression bound = Binder.bind(STRUCT, lessThan(extract("var", 
"$['a\\'b']", "long"), 100));
+    BoundExtract<?> boundExtract = (BoundExtract<?>) 
TestHelpers.assertAndUnwrap(bound).term();
+    assertThat(boundExtract.path()).isEqualTo("$['a\\'b']");
+  }
+
+  @Test
+  public void testVariantExtractBindingBracketPathWithUnicodeEscape() {
+    Expression bound = Binder.bind(STRUCT, lessThan(extract("var", 
"$['\\u00e9']", "long"), 100));
+    BoundExtract<?> boundExtract = (BoundExtract<?>) 
TestHelpers.assertAndUnwrap(bound).term();
+    assertThat(boundExtract.path()).isEqualTo("$['é']");
+  }
+
+  @Test
+  public void 
testVariantExtractUnbindPreservesNormalizedPathForDotInSegmentName() {
+    Expression boundExpr = lessThan(extract("var", "$['a.b']", "long"), 
100L).bind(STRUCT, true);

Review Comment:
   I'd factor out the repetition here so that you have one method
   
   `assertExtracted(in, out)` and use it across multiple tests



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':
+          pos = appendDotSegment(path, pos, segments);
+          break;
+
+        case '[':
+          pos = appendBracketOrIndexSegment(path, pos, segments, 
bracketMatcher);
+          break;
+
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid path, expected '.' or '[' at position %s: 
%s", pos, path));
+      }
+    }
+
+    return segments;
+  }
+
+  /** Consumes from {@code path[dotPos]} a leading {@code .} and RFC 9535 
shorthand member name. */
+  private static int appendDotSegment(String path, int dotPos, 
List<PathSegment> segments) {
+    int pos = dotPos + 1;
+    int len = path.length();
+    Preconditions.checkArgument(pos < len, "Invalid path, trailing dot: %s", 
path);
+    int start = pos;
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      if (ch == '.' || ch == '[') {
+        break;
+      }
+      pos += 1;
+    }
+
+    Preconditions.checkArgument(pos > start, "Invalid path, empty segment 
after '.': %s", path);
+    String name = path.substring(start, pos);
     Preconditions.checkArgument(
-        !path.contains("[") && !path.contains("]"), "Unsupported path, 
contains bracket: %s", path);
+        RFC9535_MEMBER_NAME_SHORTHAND.test(name),
+        "Invalid path: %s (%s has invalid characters)",
+        path,
+        name);
+    segments.add(new PathSegment.Name(name));
+    return pos;
+  }
+
+  /**
+   * Consumes either a numeric array index {@code [n]} or a quoted name {@code 
['...']} starting at
+   * {@code path[bracketPos]}.
+   */
+  private static int appendBracketOrIndexSegment(
+      String path, int bracketPos, List<PathSegment> segments, Matcher 
bracketMatcher) {
     Preconditions.checkArgument(
-        !path.contains("*"), "Unsupported path, contains wildcard: %s", path);
+        bracketPos < path.length() && path.charAt(bracketPos) == '[', "Invalid 
path: %s", path);
+    if (bracketPos + 1 < path.length() && isAsciiDigit(path.charAt(bracketPos 
+ 1))) {
+      return appendArrayIndexSegment(path, bracketPos, segments);
+    }
+    return appendQuotedBracketSegment(path, bracketPos, segments, 
bracketMatcher);
+  }
+
+  private static boolean isAsciiDigit(char ch) {
+    return ch >= '0' && ch <= '9';
+  }
+
+  private static int appendArrayIndexSegment(
+      String path, int bracketPos, List<PathSegment> segments) {
+    int pos = bracketPos + 1;
+    int len = path.length();
+    int start = pos;
+    while (pos < len && isAsciiDigit(path.charAt(pos))) {
+      pos += 1;
+    }
+    Preconditions.checkArgument(pos > start, "Invalid path, empty array index 
in: %s", path);
     Preconditions.checkArgument(
-        !path.contains(".."), "Unsupported path, contains recursive descent: 
%s", path);
+        pos < len && path.charAt(pos) == ']', "Invalid path, unclosed array 
index in: %s", path);
+    int index = Integer.parseInt(path.substring(start, pos));
+    Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index in: %s", path);
+    segments.add(new PathSegment.Index(index));
+    return pos + 1;
+  }
 
-    List<String> parts = DOT.splitToList(path);
+  private static int appendQuotedBracketSegment(

Review Comment:
   for these and the others I found it a bit hard to follow. Could the javadocs 
explain what segments they expect. And maye move that List<PathSegment> 
segments to be at the start of every method as it's what gets updated



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':
+          pos = appendDotSegment(path, pos, segments);
+          break;
+
+        case '[':
+          pos = appendBracketOrIndexSegment(path, pos, segments, 
bracketMatcher);
+          break;
+
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid path, expected '.' or '[' at position %s: 
%s", pos, path));
+      }
+    }
+
+    return segments;
+  }
+
+  /** Consumes from {@code path[dotPos]} a leading {@code .} and RFC 9535 
shorthand member name. */
+  private static int appendDotSegment(String path, int dotPos, 
List<PathSegment> segments) {
+    int pos = dotPos + 1;
+    int len = path.length();
+    Preconditions.checkArgument(pos < len, "Invalid path, trailing dot: %s", 
path);
+    int start = pos;
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      if (ch == '.' || ch == '[') {
+        break;
+      }
+      pos += 1;
+    }
+
+    Preconditions.checkArgument(pos > start, "Invalid path, empty segment 
after '.': %s", path);
+    String name = path.substring(start, pos);
     Preconditions.checkArgument(
-        !path.contains("[") && !path.contains("]"), "Unsupported path, 
contains bracket: %s", path);
+        RFC9535_MEMBER_NAME_SHORTHAND.test(name),
+        "Invalid path: %s (%s has invalid characters)",
+        path,
+        name);
+    segments.add(new PathSegment.Name(name));
+    return pos;
+  }
+
+  /**
+   * Consumes either a numeric array index {@code [n]} or a quoted name {@code 
['...']} starting at
+   * {@code path[bracketPos]}.
+   */
+  private static int appendBracketOrIndexSegment(
+      String path, int bracketPos, List<PathSegment> segments, Matcher 
bracketMatcher) {
     Preconditions.checkArgument(
-        !path.contains("*"), "Unsupported path, contains wildcard: %s", path);
+        bracketPos < path.length() && path.charAt(bracketPos) == '[', "Invalid 
path: %s", path);
+    if (bracketPos + 1 < path.length() && isAsciiDigit(path.charAt(bracketPos 
+ 1))) {
+      return appendArrayIndexSegment(path, bracketPos, segments);
+    }
+    return appendQuotedBracketSegment(path, bracketPos, segments, 
bracketMatcher);

Review Comment:
   how does this handle the situation where the string is "["?
   



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";
+
   private static final Pattern RFC9535_REQUIRES_ESCAPE =
       Pattern.compile(
           
"[^\\x{0020}-\\x{0026}\\x{0028}-\\x{005B}\\x{005D}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]");
 
+  /**
+   * Matches one bracket segment {@code ['...']} where inner text may contain 
RFC 9535 escapes
+   * (quote, backslash, control characters, and four-digit hex escapes).
+   */
+  private static final Pattern BRACKET_SEGMENT = 
Pattern.compile("\\['((?:[^'\\\\]|\\\\.)*)'\\]");
+
   private static final Map<Character, String> RFC9535_ESCAPE_REPLACEMENTS = 
buildReplacementMap();
 
-  private static final Splitter DOT = Splitter.on(".");
   private static final String ROOT = "$";
 
-  static List<String> parse(String path) {
+  /**
+   * Parses a path into segments. After the root {@code $}, each segment is 
either dot shorthand
+   * ({@code .name} per RFC 9535), a single-quoted bracket name ({@code 
['...']}) with RFC 9535
+   * escapes, or a numeric array index ({@code [n]}). Forms may be mixed (e.g. 
{@code $.a['b.c']},
+   * {@code $.items[0].tags}, {@code $.matrix[0][1]}). Wildcards and recursive 
descent are not
+   * supported.
+   *
+   * <p>The root path {@code $} yields an empty segment list.
+   */
+  static List<PathSegment> parse(String path) {
     Preconditions.checkArgument(path != null, "Invalid path: null");
+    Preconditions.checkArgument(!path.isEmpty(), "Invalid path: empty");
+    Preconditions.checkArgument(
+        path.startsWith(ROOT), "Invalid path, does not start with %s: %s", 
ROOT, path);
+
+    if (path.equals(ROOT)) {
+      return Lists.newArrayList();
+    }
+
+    return parseAfterRoot(path);
+  }
+
+  /** Normalizes object field names only (no array indices). */
+  public static String toNormalizedPath(Iterable<String> fields) {
+    return toNormalizedPath(
+        
Streams.stream(fields).map(PathSegment.Name::new).collect(Collectors.toList()));
+  }
+
+  static String toNormalizedPath(List<PathSegment> segments) {
+    StringBuilder builder = new StringBuilder(ROOT);
+    for (PathSegment segment : segments) {
+      if (segment instanceof PathSegment.Name) {
+        String name = ((PathSegment.Name) segment).name();
+        builder.append("['").append(rfc9535escape(name)).append("']");
+      } else if (segment instanceof PathSegment.Index) {
+        int index = ((PathSegment.Index) segment).index();
+        Preconditions.checkArgument(index >= 0, "Invalid path, negative array 
index: %s", index);
+        builder.append('[').append(index).append(']');
+      } else {
+        throw new IllegalStateException("Unknown segment: " + segment);
+      }
+    }
+    return builder.toString();
+  }
+
+  @SuppressWarnings("StatementSwitchToExpressionSwitch")
+  private static List<PathSegment> parseAfterRoot(String path) {
+    List<PathSegment> segments = Lists.newArrayList();
+    Matcher bracketMatcher = BRACKET_SEGMENT.matcher(path);
+    int len = path.length();
+    int pos = ROOT.length();
+
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      switch (ch) {
+        case '.':
+          pos = appendDotSegment(path, pos, segments);
+          break;
+
+        case '[':
+          pos = appendBracketOrIndexSegment(path, pos, segments, 
bracketMatcher);
+          break;
+
+        default:
+          throw new IllegalArgumentException(
+              String.format("Invalid path, expected '.' or '[' at position %s: 
%s", pos, path));
+      }
+    }
+
+    return segments;
+  }
+
+  /** Consumes from {@code path[dotPos]} a leading {@code .} and RFC 9535 
shorthand member name. */
+  private static int appendDotSegment(String path, int dotPos, 
List<PathSegment> segments) {
+    int pos = dotPos + 1;
+    int len = path.length();
+    Preconditions.checkArgument(pos < len, "Invalid path, trailing dot: %s", 
path);
+    int start = pos;
+    while (pos < len) {
+      char ch = path.charAt(pos);
+      if (ch == '.' || ch == '[') {
+        break;
+      }
+      pos += 1;

Review Comment:
   pos++;
   
   you've been writing scala, haven't you?



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {

Review Comment:
   I'm going to have to play with these java17 features; reminds me a lot of 
standard ML etc. We don't get the full switching on them until Java21, do we?



##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionBinding.java:
##########
@@ -378,15 +450,13 @@ public void testExtractExpressionBindingPaths(String 
path) {
         null,
         "",
         "event_id", // missing root

Review Comment:
   so you've cut some valid paths here. where do they get tested and why the 
move? is they return a different type now?



##########
api/src/test/java/org/apache/iceberg/expressions/TestPathUtil.java:
##########
@@ -123,4 +142,157 @@ public void testNormalizedFieldLists(List<String> fields, 
String normalizedPath)
   public void testPathEscaping(String name, String escaped) {
     assertThat(PathUtil.rfc9535escape(name)).isEqualTo(escaped);
   }
+
+  @ParameterizedTest
+  @FieldSource("ESCAPE_CASES")
+  public void testPathUnescapeRoundTrip(String name, String escaped) {
+    assertThat(PathUtil.rfc9535unescape(escaped)).isEqualTo(name);
+  }
+
+  @ParameterizedTest
+  @FieldSource("NORMALIZED_PATHS")
+  public void testParseDotAndBracketAgree(String dotPath, String 
normalizedPath) {
+    
assertThat(PathUtil.parse(dotPath)).isEqualTo(PathUtil.parse(normalizedPath));
+  }
+
+  @Test
+  public void testParseSingleSegmentWithDot() {
+    assertThat(PathUtil.parse("$['a.b']")).isEqualTo(names("a.b"));
+    assertThat(PathUtil.parse("$['user.name']")).isEqualTo(names("user.name"));
+  }
+
+  @Test
+  public void testParseDistinctFromDotSplit() {
+    assertThat(PathUtil.parse("$.a.b")).isEqualTo(names("a", "b"));
+    assertThat(PathUtil.parse("$['a.b']")).isEqualTo(names("a.b"));
+  }
+
+  @Test
+  public void testParseNormalizedRoundTrip() {
+    for (Object[] row : NORMALIZED_FIELD_LISTS) {
+      @SuppressWarnings("unchecked")
+      List<String> fields = (List<String>) row[0];
+      String normalized = (String) row[1];
+      
assertThat(PathUtil.toNormalizedPath(PathUtil.parse(normalized))).isEqualTo(normalized);
+      assertThat(PathUtil.parse(normalized))
+          
.isEqualTo(fields.stream().map(Name::new).collect(Collectors.toList()));
+    }
+  }
+
+  @Test
+  public void testParseRoot() {
+    assertThat(PathUtil.parse("$")).isEqualTo(List.of());
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$"))).isEqualTo("$");
+  }
+
+  /**
+   * Bracket paths can represent keys that dot notation cannot (empty name, 
{@code []}, {@code ..}).
+   */
+  @Test
+  public void testParseSpecialFieldNames() {
+    assertThat(PathUtil.parse("$['']")).isEqualTo(names(""));
+    assertThat(PathUtil.parse("$['[]']")).isEqualTo(names("[]"));
+    assertThat(PathUtil.parse("$['..']")).isEqualTo(names(".."));
+    assertThat(PathUtil.parse("$['*']")).isEqualTo(names("*"));
+    assertThat(PathUtil.parse("$['[']")).isEqualTo(names("["));
+    assertThat(PathUtil.parse("$[']']")).isEqualTo(names("]"));
+
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['']"))).isEqualTo("$['']");
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['[]']"))).isEqualTo("$['[]']");
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['..']"))).isEqualTo("$['..']");
+  }
+
+  @Test
+  public void testParseNestedWithSpecialMiddleSegment() {
+    assertThat(PathUtil.parse("$['a']['..']['b']")).isEqualTo(names("a", "..", 
"b"));
+    assertThat(PathUtil.toNormalizedPath(List.of("a", "..", 
"b"))).isEqualTo("$['a']['..']['b']");
+  }
+
+  @Test
+  public void testParseMixedDotAndBracket() {
+    assertThat(PathUtil.parse("$.a['b.c']")).isEqualTo(names("a", "b.c"));
+    assertThat(PathUtil.parse("$.events['user.name'].id"))
+        .isEqualTo(names("events", "user.name", "id"));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.a['b.c']"))).isEqualTo("$['a']['b.c']");
+  }
+
+  @Test
+  public void testParseMixedBracketThenDot() {
+    assertThat(PathUtil.parse("$['x.y'].z")).isEqualTo(names("x.y", "z"));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['x.y'].z"))).isEqualTo("$['x.y']['z']");
+  }
+
+  @Test
+  public void testParseArrayIndexSegments() {
+    assertThat(PathUtil.parse("$.matrix[0][1]"))
+        .isEqualTo(List.of(new Name("matrix"), new Index(0), new Index(1)));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.matrix[0][1]")))
+        .isEqualTo("$['matrix'][0][1]");
+
+    assertThat(PathUtil.parse("$.basket[0][2].a"))
+        .isEqualTo(List.of(new Name("basket"), new Index(0), new Index(2), new 
Name("a")));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.basket[0][2].a")))
+        .isEqualTo("$['basket'][0][2]['a']");
+
+    assertThat(PathUtil.parse("$.items[0].tags[1]"))
+        .isEqualTo(List.of(new Name("items"), new Index(0), new Name("tags"), 
new Index(1)));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.items[0].tags[1]")))
+        .isEqualTo("$['items'][0]['tags'][1]");
+
+    assertThat(PathUtil.parse("$.events[0].event_id"))
+        .isEqualTo(List.of(new Name("events"), new Index(0), new 
Name("event_id")));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.events[0].event_id")))
+        .isEqualTo("$['events'][0]['event_id']");
+
+    assertThat(PathUtil.parse("$['items'][0]['tags'][1]"))
+        .isEqualTo(PathUtil.parse("$.items[0].tags[1]"));
+    
assertThat(PathUtil.parse("$['matrix'][0][1]")).isEqualTo(PathUtil.parse("$.matrix[0][1]"));
+  }
+
+  @Test
+  public void testParseArrayIndexRoundTrip() {
+    // parse(toNormalizedPath(parse(x))) == parse(x) for paths with [n] 
segments
+    for (String[] pair : NORMALIZED_PATHS) {
+      String normalized = pair[1];
+      if (normalized.contains("[") && !normalized.contains("['")) {
+        // normalized path has array indices; round-trip must be stable
+        
assertThat(PathUtil.parse(PathUtil.toNormalizedPath(PathUtil.parse(normalized))))
+            .isEqualTo(PathUtil.parse(normalized));
+      }
+    }
+    // explicit cases to be clear about what is covered
+    for (String path :
+        new String[] {
+          "$['matrix'][0][1]",
+          "$['items'][0]['tags'][1]",
+          "$['basket'][0][2]['a']",
+          "$['events'][0]['event_id']",
+        }) {
+      
assertThat(PathUtil.parse(PathUtil.toNormalizedPath(PathUtil.parse(path))))
+          .isEqualTo(PathUtil.parse(path));
+    }
+  }
+
+  private static final String[] INVALID_PARSE_PATHS =
+      new String[] {
+        null,
+        "",
+        "event_id",
+        "$.a..b",
+        "$.events.*",
+        "$['a'", // unclosed
+        "$['a']x", // trailing junk
+        "$.['a']", // empty dot segment before bracket
+        "$a.b", // missing separator after $
+        "$.a[]", // empty array index
+        "$.a[-1]", // negative index (not valid JSONPath index syntax here)

Review Comment:
   add
   ```
   $[
   $$
   ```
   



##########
api/src/main/java/org/apache/iceberg/expressions/PathUtil.java:
##########
@@ -27,60 +27,255 @@
 import java.util.stream.Collectors;
 import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
-import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Streams;
 
 public class PathUtil {
   private PathUtil() {}
 
+  /**
+   * One step in a variant JSONPath: an object member name or a zero-based 
array index (RFC 9535
+   * {@code [n]} selector).
+   */
+  sealed interface PathSegment permits PathSegment.Name, PathSegment.Index {
+    record Name(String name) implements PathSegment {}
+
+    record Index(int index) implements PathSegment {}
+  }
+
   private static final String RFC9535_NAME_FIRST =
       "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
   private static final String RFC9535_NAME_CHARS =
       "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
   private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
       Pattern.compile(RFC9535_NAME_FIRST + 
RFC9535_NAME_CHARS).asMatchPredicate();
 
+  /** Letters that follow {@code \} for control-character escapes in RFC 9535 
quoted segments. */
+  private static final String RFC9535_SIMPLE_ESCAPE_LETTERS = "btnfr";
+
+  private static final String RFC9535_SIMPLE_ESCAPE_CHARS = "\b\t\n\f\r";

Review Comment:
   had to look \f up. never seen it or used it. But if it's in the spec, it 
needs coverage



##########
api/src/test/java/org/apache/iceberg/expressions/TestPathUtil.java:
##########
@@ -123,4 +142,157 @@ public void testNormalizedFieldLists(List<String> fields, 
String normalizedPath)
   public void testPathEscaping(String name, String escaped) {
     assertThat(PathUtil.rfc9535escape(name)).isEqualTo(escaped);
   }
+
+  @ParameterizedTest
+  @FieldSource("ESCAPE_CASES")
+  public void testPathUnescapeRoundTrip(String name, String escaped) {
+    assertThat(PathUtil.rfc9535unescape(escaped)).isEqualTo(name);
+  }
+
+  @ParameterizedTest
+  @FieldSource("NORMALIZED_PATHS")
+  public void testParseDotAndBracketAgree(String dotPath, String 
normalizedPath) {
+    
assertThat(PathUtil.parse(dotPath)).isEqualTo(PathUtil.parse(normalizedPath));
+  }
+
+  @Test
+  public void testParseSingleSegmentWithDot() {
+    assertThat(PathUtil.parse("$['a.b']")).isEqualTo(names("a.b"));
+    assertThat(PathUtil.parse("$['user.name']")).isEqualTo(names("user.name"));
+  }
+
+  @Test
+  public void testParseDistinctFromDotSplit() {
+    assertThat(PathUtil.parse("$.a.b")).isEqualTo(names("a", "b"));
+    assertThat(PathUtil.parse("$['a.b']")).isEqualTo(names("a.b"));
+  }
+
+  @Test
+  public void testParseNormalizedRoundTrip() {
+    for (Object[] row : NORMALIZED_FIELD_LISTS) {
+      @SuppressWarnings("unchecked")
+      List<String> fields = (List<String>) row[0];
+      String normalized = (String) row[1];
+      
assertThat(PathUtil.toNormalizedPath(PathUtil.parse(normalized))).isEqualTo(normalized);
+      assertThat(PathUtil.parse(normalized))
+          
.isEqualTo(fields.stream().map(Name::new).collect(Collectors.toList()));
+    }
+  }
+
+  @Test
+  public void testParseRoot() {
+    assertThat(PathUtil.parse("$")).isEqualTo(List.of());
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$"))).isEqualTo("$");
+  }
+
+  /**
+   * Bracket paths can represent keys that dot notation cannot (empty name, 
{@code []}, {@code ..}).
+   */
+  @Test
+  public void testParseSpecialFieldNames() {
+    assertThat(PathUtil.parse("$['']")).isEqualTo(names(""));
+    assertThat(PathUtil.parse("$['[]']")).isEqualTo(names("[]"));
+    assertThat(PathUtil.parse("$['..']")).isEqualTo(names(".."));
+    assertThat(PathUtil.parse("$['*']")).isEqualTo(names("*"));
+    assertThat(PathUtil.parse("$['[']")).isEqualTo(names("["));
+    assertThat(PathUtil.parse("$[']']")).isEqualTo(names("]"));
+
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['']"))).isEqualTo("$['']");
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['[]']"))).isEqualTo("$['[]']");
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['..']"))).isEqualTo("$['..']");
+  }
+
+  @Test
+  public void testParseNestedWithSpecialMiddleSegment() {
+    assertThat(PathUtil.parse("$['a']['..']['b']")).isEqualTo(names("a", "..", 
"b"));
+    assertThat(PathUtil.toNormalizedPath(List.of("a", "..", 
"b"))).isEqualTo("$['a']['..']['b']");
+  }
+
+  @Test
+  public void testParseMixedDotAndBracket() {
+    assertThat(PathUtil.parse("$.a['b.c']")).isEqualTo(names("a", "b.c"));
+    assertThat(PathUtil.parse("$.events['user.name'].id"))
+        .isEqualTo(names("events", "user.name", "id"));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.a['b.c']"))).isEqualTo("$['a']['b.c']");
+  }
+
+  @Test
+  public void testParseMixedBracketThenDot() {
+    assertThat(PathUtil.parse("$['x.y'].z")).isEqualTo(names("x.y", "z"));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$['x.y'].z"))).isEqualTo("$['x.y']['z']");
+  }
+
+  @Test
+  public void testParseArrayIndexSegments() {
+    assertThat(PathUtil.parse("$.matrix[0][1]"))
+        .isEqualTo(List.of(new Name("matrix"), new Index(0), new Index(1)));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.matrix[0][1]")))
+        .isEqualTo("$['matrix'][0][1]");
+
+    assertThat(PathUtil.parse("$.basket[0][2].a"))
+        .isEqualTo(List.of(new Name("basket"), new Index(0), new Index(2), new 
Name("a")));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.basket[0][2].a")))
+        .isEqualTo("$['basket'][0][2]['a']");
+
+    assertThat(PathUtil.parse("$.items[0].tags[1]"))
+        .isEqualTo(List.of(new Name("items"), new Index(0), new Name("tags"), 
new Index(1)));
+    assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.items[0].tags[1]")))
+        .isEqualTo("$['items'][0]['tags'][1]");
+
+    assertThat(PathUtil.parse("$.events[0].event_id"))
+        .isEqualTo(List.of(new Name("events"), new Index(0), new 
Name("event_id")));
+    
assertThat(PathUtil.toNormalizedPath(PathUtil.parse("$.events[0].event_id")))
+        .isEqualTo("$['events'][0]['event_id']");
+
+    assertThat(PathUtil.parse("$['items'][0]['tags'][1]"))
+        .isEqualTo(PathUtil.parse("$.items[0].tags[1]"));
+    
assertThat(PathUtil.parse("$['matrix'][0][1]")).isEqualTo(PathUtil.parse("$.matrix[0][1]"));
+  }
+
+  @Test
+  public void testParseArrayIndexRoundTrip() {
+    // parse(toNormalizedPath(parse(x))) == parse(x) for paths with [n] 
segments
+    for (String[] pair : NORMALIZED_PATHS) {
+      String normalized = pair[1];
+      if (normalized.contains("[") && !normalized.contains("['")) {
+        // normalized path has array indices; round-trip must be stable
+        
assertThat(PathUtil.parse(PathUtil.toNormalizedPath(PathUtil.parse(normalized))))
+            .isEqualTo(PathUtil.parse(normalized));
+      }
+    }
+    // explicit cases to be clear about what is covered
+    for (String path :
+        new String[] {
+          "$['matrix'][0][1]",
+          "$['items'][0]['tags'][1]",
+          "$['basket'][0][2]['a']",
+          "$['events'][0]['event_id']",
+        }) {
+      
assertThat(PathUtil.parse(PathUtil.toNormalizedPath(PathUtil.parse(path))))
+          .isEqualTo(PathUtil.parse(path));
+    }
+  }
+
+  private static final String[] INVALID_PARSE_PATHS =
+      new String[] {
+        null,
+        "",
+        "event_id",
+        "$.a..b",
+        "$.events.*",
+        "$['a'", // unclosed
+        "$['a']x", // trailing junk
+        "$.['a']", // empty dot segment before bracket
+        "$a.b", // missing separator after $
+        "$.a[]", // empty array index
+        "$.a[-1]", // negative index (not valid JSONPath index syntax here)
+      };
+
+  @ParameterizedTest
+  @FieldSource("INVALID_PARSE_PATHS")
+  public void testParseInvalid(String path) {
+    assertThatThrownBy(() -> PathUtil.parse(path))
+        .isInstanceOf(IllegalArgumentException.class)

Review Comment:
   add 
   ```
    describedAs("parse of %s", path)
   ```
   before the .isInstanceof()
   



##########
api/src/main/java/org/apache/iceberg/expressions/InclusiveMetricsEvaluator.java:
##########
@@ -632,6 +633,8 @@ private boolean isNonNullPreserving(Bound<?> term) {
   }
 
   private static VariantObject parseBounds(ByteBuffer buffer) {
-    return Variant.from(buffer).value().asObject();
+    // Explicitly use little-endian encoding for reading buffer
+    ByteBuffer littleEndian = 
buffer.duplicate().order(ByteOrder.LITTLE_ENDIAN);

Review Comment:
   I've just discovered today that Arm64 defaults to being little endian too, 
so even if the java default is from the SPARC era, little endian is what we 
need for performance on x86 and raspberry pis (oh, and macbooks, AWS Graviton 
parts...)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Api: Support variant extract and fix manifest bounds byte order [iceberg]

Reply via email to