This is an automated email from the ASF dual-hosted git repository.
stevenzwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 29ba0a14dc Core: Add V4 location relativization utilities (#16174)
29ba0a14dc is described below
commit 29ba0a14dc6667db0683fdfcd520639b3da77774
Author: Anoop Johnson <[email protected]>
AuthorDate: Fri May 22 21:16:44 2026 -0700
Core: Add V4 location relativization utilities (#16174)
* Core: Add location relativization utilities to RelativePathUtil
Add isAbsolute, resolve, and relativize methods for converting
between absolute and relative file paths. These will be used by
v4 metadata to store locations relative to the table location.
---
.../java/org/apache/iceberg/util/LocationUtil.java | 76 +++++++-
.../org/apache/iceberg/util/TestLocationUtil.java | 208 +++++++++++++++++++++
2 files changed, 282 insertions(+), 2 deletions(-)
diff --git a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java
index 4c0d401c74..21eacbfbd6 100644
--- a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java
+++ b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java
@@ -24,14 +24,16 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Strings;
public class LocationUtil {
+ public static final String PATH_SEPARATOR = "/";
+
private LocationUtil() {}
public static String stripTrailingSlash(String path) {
Preconditions.checkArgument(!Strings.isNullOrEmpty(path), "path must not
be null or empty");
String result = path;
- while (!result.endsWith("://") && result.endsWith("/")) {
- result = result.substring(0, result.length() - 1);
+ while (!result.endsWith("://") && result.endsWith(PATH_SEPARATOR)) {
+ result = result.substring(0, result.length() - PATH_SEPARATOR.length());
}
return result;
}
@@ -57,4 +59,74 @@ public class LocationUtil {
return tableIdentifier.name();
}
}
+
+ /**
+ * Returns true if the location contains a URI scheme (e.g. {@code s3:}, {@code hdfs:}, {@code
+ * file:}), per <a href="https://datatracker.ietf.org/doc/html/rfc3986#section-3.1">RFC 3986
+ * section 3.1</a>.
+ */
+ private static boolean hasScheme(String location) {
+ for (int i = 0; i < location.length(); i += 1) {
+ char ch = location.charAt(i);
+ if (ch == ':') {
+ return i > 0;
+ }
+
+ if (!isSchemeChar(ch, i)) {
+ return false;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Returns true if {@code ch} is allowed at {@code position} in a URI scheme, per <a
+ * href="https://datatracker.ietf.org/doc/html/rfc3986#section-3.1">RFC 3986 section 3.1</a>:
+ * {@code scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )}.
+ */
+ private static boolean isSchemeChar(char ch, int position) {
+ if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
+ return true;
+ }
+
+ return position > 0 && ((ch >= '0' && ch <= '9') || ch == '+' || ch == '-'
|| ch == '.');
+ }
+
+ /**
+ * Resolves a location against a table location. If the location has a URI scheme, it is returned
+ * as-is. Otherwise, the table location and the relative location are joined by the URI separator
+ * character {@code /}.
+ *
+ * <p>The separator is appended unconditionally; {@code tableLocation} is expected not to end with
+ * {@code /} and {@code location} is expected not to start with {@code /}. Otherwise the result
+ * will contain a duplicate {@code //}.
+ */
+ public static String resolveLocation(String tableLocation, String location) {
+ if (hasScheme(location)) {
+ return location;
+ }
+
+ return tableLocation + PATH_SEPARATOR + location;
+ }
+
+ /**
+ * Relativizes a location against a table location. If the location starts with the table location
+ * immediately followed by the URI separator character {@code /}, the prefix and separator are
+ * removed and the remaining relative portion is returned. Otherwise, the location is returned
+ * as-is.
+ *
+ * <p>{@code tableLocation} is expected not to end with {@code /}. A trailing separator on the
+ * table location will cause locations that would otherwise match to be returned unchanged.
+ */
+ public static String relativizeLocation(String tableLocation, String
location) {
+ int prefixLength = tableLocation.length();
+ if (location.length() > prefixLength
+ && location.startsWith(PATH_SEPARATOR, prefixLength)
+ && location.startsWith(tableLocation)) {
+ return location.substring(prefixLength + PATH_SEPARATOR.length());
+ }
+
+ return location;
+ }
}
diff --git a/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java b/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java
index 9a7b2768d9..0ce0c9ad31 100644
--- a/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java
+++ b/core/src/test/java/org/apache/iceberg/util/TestLocationUtil.java
@@ -84,4 +84,212 @@ public class TestLocationUtil {
.as("Should be root path")
.isEqualTo(rootPath);
}
+
+ @Test
+ public void testResolveRelativeLocations() {
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(LocationUtil.resolveLocation(tableLocation,
"metadata/file.parquet"))
+ .isEqualTo("s3://bucket/db/table/metadata/file.parquet");
+
+ assertThat(LocationUtil.resolveLocation(tableLocation,
"data/00000-0.parquet"))
+ .isEqualTo("s3://bucket/db/table/data/00000-0.parquet");
+ }
+
+ @Test
+ public void testResolveLocationsWithColonsInSegments() {
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(LocationUtil.resolveLocation(tableLocation,
"data/partition=key:value/file.parquet"))
+
.isEqualTo("s3://bucket/db/table/data/partition=key:value/file.parquet");
+
+ assertThat(LocationUtil.resolveLocation(tableLocation,
"metadata/snap-123:456.avro"))
+ .isEqualTo("s3://bucket/db/table/metadata/snap-123:456.avro");
+ }
+
+ @Test
+ public void testResolveAbsoluteLocationsUnchanged() {
+ String tableLocation = "s3://bucket/db/table";
+
+ // different scheme (from the spec example table)
+ assertThat(
+ LocationUtil.resolveLocation(tableLocation,
"hdfs://wh/db/table/data/00000-0.parquet"))
+ .isEqualTo("hdfs://wh/db/table/data/00000-0.parquet");
+
+ // different bucket
+ assertThat(
+ LocationUtil.resolveLocation(
+ tableLocation, "s3://other-bucket/db/table/data/file.parquet"))
+ .isEqualTo("s3://other-bucket/db/table/data/file.parquet");
+
+ // same bucket, different path
+ assertThat(
+ LocationUtil.resolveLocation(
+ tableLocation, "s3://bucket/db/other-table/data/file.parquet"))
+ .isEqualTo("s3://bucket/db/other-table/data/file.parquet");
+ }
+
+ @Test
+ public void testRelativize() {
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(
+ LocationUtil.relativizeLocation(
+ tableLocation, "s3://bucket/db/table/metadata/file.parquet"))
+ .isEqualTo("metadata/file.parquet");
+
+ assertThat(
+ LocationUtil.relativizeLocation(
+ tableLocation, "s3://bucket/db/table/data/00000-0.parquet"))
+ .isEqualTo("data/00000-0.parquet");
+ }
+
+ @Test
+ public void testRelativizeLocationNotUnderTableLocation() {
+ String tableLocation = "s3://bucket/db/table";
+
+ // different bucket
+ assertThat(
+ LocationUtil.relativizeLocation(
+ tableLocation, "s3://other-bucket/db/table/data/file.parquet"))
+ .isEqualTo("s3://other-bucket/db/table/data/file.parquet");
+
+ // same bucket, different path
+ assertThat(
+ LocationUtil.relativizeLocation(
+ tableLocation, "s3://bucket/db/other-table/data/file.parquet"))
+ .isEqualTo("s3://bucket/db/other-table/data/file.parquet");
+ }
+
+ @Test
+ public void testRelativizeLocationWithSharedPrefix() {
+ // sibling locations that share a byte prefix with the table location but are not
+ // children of it must not be relativized (e.g. "table" vs "table_v2")
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(
+ LocationUtil.relativizeLocation(
+ tableLocation, "s3://bucket/db/table_v2/data/00000-0.parquet"))
+ .isEqualTo("s3://bucket/db/table_v2/data/00000-0.parquet");
+ }
+
+ @Test
+ public void testRelativizeLocationEqualToTableLocation() {
+ // a location equal to the table location is not followed by a separator,
+ // so it is not a child of the table location and is returned as-is
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(LocationUtil.relativizeLocation(tableLocation, tableLocation))
+ .isEqualTo(tableLocation);
+ }
+
+ @Test
+ public void testRelativizeMismatchedFileSchemeNotRelativized() {
+ // mixed file: variants are NOT relativized. Consistent URI forms are the caller's
+ // responsibility
+ assertThat(
+ LocationUtil.relativizeLocation(
+ "file:/tmp/table", "file:///tmp/table/metadata/file.parquet"))
+ .isEqualTo("file:///tmp/table/metadata/file.parquet");
+
+ assertThat(
+ LocationUtil.relativizeLocation(
+ "file:///tmp/table", "file:/tmp/table/metadata/file.parquet"))
+ .isEqualTo("file:/tmp/table/metadata/file.parquet");
+ }
+
+ @Test
+ public void testResolveAbsoluteLocationWithNonAlphanumericScheme() {
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(LocationUtil.resolveLocation(tableLocation,
"git+ssh://host/repo"))
+ .isEqualTo("git+ssh://host/repo");
+ }
+
+ @Test
+ public void testResolveTreatsNonAsciiSchemeAsRelative() {
+ // RFC 3986 restricts schemes to US-ASCII; a non-ASCII letter such as the Greek alpha
+ // (U+03B1) is not a valid scheme character and the location is treated as relative
+ String tableLocation = "s3://bucket/db/table";
+ String location = "αscheme://host/path";
+
+ assertThat(LocationUtil.resolveLocation(tableLocation, location))
+ .isEqualTo(tableLocation + "/" + location);
+ }
+
+ @Test
+ public void testResolveTreatsNonAlphaLeadingCharAsRelative() {
+ // RFC 3986 section 3.1 requires the first scheme character to be ALPHA; locations
+ // beginning with a digit or with '+'/'-'/'.' are treated as relative
+ String tableLocation = "s3://bucket/db/table";
+
+ assertThat(LocationUtil.resolveLocation(tableLocation, "3com://host"))
+ .isEqualTo("s3://bucket/db/table/3com://host");
+
+ assertThat(LocationUtil.resolveLocation(tableLocation, "+ssh://host"))
+ .isEqualTo("s3://bucket/db/table/+ssh://host");
+
+ assertThat(LocationUtil.resolveLocation(tableLocation, "-foo://host"))
+ .isEqualTo("s3://bucket/db/table/-foo://host");
+
+ assertThat(LocationUtil.resolveLocation(tableLocation, ".bar://host"))
+ .isEqualTo("s3://bucket/db/table/.bar://host");
+ }
+
+ @Test
+ public void testRelativizeResolveRoundTrip() {
+ String tableLocation = "s3://bucket/db/table";
+ String absoluteLocation =
"s3://bucket/db/table/metadata/root-manifest.parquet";
+
+ String relativized = LocationUtil.relativizeLocation(tableLocation,
absoluteLocation);
+ assertThat(relativized).isEqualTo("metadata/root-manifest.parquet");
+
+ String resolved = LocationUtil.resolveLocation(tableLocation, relativized);
+ assertThat(resolved).isEqualTo(absoluteLocation);
+ }
+
+ @Test
+ public void testRelativizeResolveRoundTripWithFileScheme() {
+ String tableLocation = "file:///tmp/warehouse/table";
+ String absoluteLocation =
"file:///tmp/warehouse/table/metadata/root-manifest.parquet";
+
+ String relativized = LocationUtil.relativizeLocation(tableLocation,
absoluteLocation);
+ assertThat(relativized).isEqualTo("metadata/root-manifest.parquet");
+
+ String resolved = LocationUtil.resolveLocation(tableLocation, relativized);
+ assertThat(resolved).isEqualTo(absoluteLocation);
+ }
+
+ @Test
+ public void
testResolveWithTrailingOrLeadingSlashProducesDuplicateSeparator() {
+ // the spec documents that joining a table location ending with '/' or a relative location
+ // starting with '/' yields a duplicate '//'; callers are expected to avoid this
+ assertThat(LocationUtil.resolveLocation("s3://bucket/db/table/",
"data/00000-0.parquet"))
+ .isEqualTo("s3://bucket/db/table//data/00000-0.parquet");
+
+ assertThat(LocationUtil.resolveLocation("s3://bucket/db/table",
"/data/00000-0.parquet"))
+ .isEqualTo("s3://bucket/db/table//data/00000-0.parquet");
+ }
+
+ @Test
+ public void testRelativizeWithTrailingSlashTableLocationNotRelativized() {
+ // a trailing '/' on the table location prevents the prefix match because relativization
+ // expects the separator to follow the prefix; the location is returned as-is
+ assertThat(
+ LocationUtil.relativizeLocation(
+ "s3://bucket/db/table/",
"s3://bucket/db/table/data/00000-0.parquet"))
+ .isEqualTo("s3://bucket/db/table/data/00000-0.parquet");
+ }
+
+ @Test
+ public void testRelativizeResolveRoundTripWithHDFS() {
+ String tableLocation = "hdfs://namenode/warehouse/table";
+ String absoluteLocation =
"hdfs://namenode/warehouse/table/data/00000-0.parquet";
+
+ String relativized = LocationUtil.relativizeLocation(tableLocation,
absoluteLocation);
+ assertThat(relativized).isEqualTo("data/00000-0.parquet");
+
+ String resolved = LocationUtil.resolveLocation(tableLocation, relativized);
+ assertThat(resolved).isEqualTo(absoluteLocation);
+ }
}