This is an automated email from the ASF dual-hosted git repository.
yuxia pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fluss.git
The following commit(s) were added to refs/heads/main by this push:
new 258fb8183 [build] Update docker image dependencies for flink/fluss iceberg quickstart (#1813)
258fb8183 is described below
commit 258fb81838fce584eee38f4df33ceb4afa52cdf3
Author: MehulBatra <[email protected]>
AuthorDate: Tue Oct 21 08:57:23 2025 +0530
[build] Update docker image dependencies for flink/fluss iceberg quickstart (#1813)
---------
Co-authored-by: luoyuxia <[email protected]>
---
docker/quickstart-flink/prepare_build.sh | 53 ++++++++++++++++++++++++--------
fluss-lake/fluss-lake-iceberg/pom.xml | 19 ++++++++++++
website/docs/quickstart/flink-iceberg.md | 52 ++++++++++++++++---------------
3 files changed, 86 insertions(+), 38 deletions(-)
diff --git a/docker/quickstart-flink/prepare_build.sh b/docker/quickstart-flink/prepare_build.sh
index f73d4cbf2..222cf35f3 100755
--- a/docker/quickstart-flink/prepare_build.sh
+++ b/docker/quickstart-flink/prepare_build.sh
@@ -109,6 +109,7 @@ check_prerequisites() {
local required_dirs=(
"$PROJECT_ROOT/fluss-flink/fluss-flink-1.20/target"
"$PROJECT_ROOT/fluss-lake/fluss-lake-paimon/target"
+ "$PROJECT_ROOT/fluss-lake/fluss-lake-iceberg/target"
"$PROJECT_ROOT/fluss-flink/fluss-flink-tiering/target"
)
@@ -140,6 +141,7 @@ main() {
log_info "Copying Fluss connector JARs..."
copy_jar "$PROJECT_ROOT/fluss-flink/fluss-flink-1.20/target/fluss-flink-1.20-*.jar" "./lib" "fluss-flink-1.20 connector"
copy_jar "$PROJECT_ROOT/fluss-lake/fluss-lake-paimon/target/fluss-lake-paimon-*.jar" "./lib" "fluss-lake-paimon connector"
+ copy_jar "$PROJECT_ROOT/fluss-lake/fluss-lake-iceberg/target/fluss-lake-iceberg-*.jar" "./lib" "fluss-lake-iceberg connector"
# Download external dependencies
log_info "Downloading external dependencies..."
@@ -151,12 +153,12 @@ main() {
"" \
"flink-faker-0.5.3"
- # Download flink-shaded-hadoop-2-uber for Hadoop integration
+ # Download Hadoop for HDFS/local filesystem support
download_jar \
"https://repo1.maven.org/maven2/io/trino/hadoop/hadoop-apache/3.3.5-2/hadoop-apache-3.3.5-2.jar" \
"./lib/hadoop-apache-3.3.5-2.jar" \
"508255883b984483a45ca48d5af6365d4f013bb8" \
- "hadoop-apache-3.3.5-2.jar"
+ "hadoop-apache-3.3.5-2"
# Download paimon-flink connector
download_jar \
@@ -165,13 +167,24 @@ main() {
"b9f8762c6e575f6786f1d156a18d51682ffc975c" \
"paimon-flink-1.20-1.2.0"
+ # Iceberg Support
+ log_info "Downloading Iceberg connector JARs..."
+
+ # Download iceberg-flink-runtime for Flink 1.20 (version 1.9.1)
+ download_jar \
+ "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.20/1.9.1/iceberg-flink-runtime-1.20-1.9.1.jar" \
+ "./lib/iceberg-flink-runtime-1.20-1.9.1.jar" \
+ "" \
+ "iceberg-flink-runtime-1.20-1.9.1"
+
+
# Prepare lake tiering JAR
log_info "Preparing lake tiering JAR..."
copy_jar "$PROJECT_ROOT/fluss-flink/fluss-flink-tiering/target/fluss-flink-tiering-*.jar" "./opt" "fluss-flink-tiering"
# Final verification
verify_jars
-
+
# Show summary
show_summary
}
@@ -179,34 +192,36 @@ main() {
# Verify that all required JAR files are present
verify_jars() {
log_info "Verifying all required JAR files are present..."
-
+
local missing_jars=()
local lib_jars=(
"fluss-flink-1.20-*.jar"
"fluss-lake-paimon-*.jar"
+ "fluss-lake-iceberg-*.jar"
"flink-faker-0.5.3.jar"
"hadoop-apache-3.3.5-2.jar"
"paimon-flink-1.20-1.2.0.jar"
+ "iceberg-flink-runtime-1.20-1.9.1.jar"
)
-
+
local opt_jars=(
"fluss-flink-tiering-*.jar"
)
-
+
# Check lib directory
for jar_pattern in "${lib_jars[@]}"; do
if ! ls ./lib/$jar_pattern >/dev/null 2>&1; then
missing_jars+=("lib/$jar_pattern")
fi
done
-
+
# Check opt directory
for jar_pattern in "${opt_jars[@]}"; do
if ! ls ./opt/$jar_pattern >/dev/null 2>&1; then
missing_jars+=("opt/$jar_pattern")
fi
done
-
+
# Report results
if [ ${#missing_jars[@]} -eq 0 ]; then
log_success "All required JAR files are present!"
@@ -224,11 +239,23 @@ show_summary() {
log_success "JAR files preparation completed!"
echo ""
log_info "📦 Generated JAR files:"
- echo "Lib directory:"
- ls -la ./lib/ 2>/dev/null || echo " (empty)"
- echo "Opt directory:"
- ls -la ./opt/ 2>/dev/null || echo " (empty)"
+ echo ""
+ echo "Lib directory (Flink connectors):"
+ ls -lh ./lib/ | tail -n +2 | awk '{printf " %-50s %8s\n", $9, $5}'
+ echo ""
+ echo "Opt directory (Tiering service):"
+ ls -lh ./opt/ | tail -n +2 | awk '{printf " %-50s %8s\n", $9, $5}'
+ echo ""
+ log_info "📋 Included Components:"
+ echo " ✓ Fluss Flink 1.20 connector"
+ echo " ✓ Fluss Lake Paimon connector"
+ echo " ✓ Fluss Lake Iceberg connector"
+ echo " ✓ Iceberg Flink runtime 1.20 (v1.9.1)"
+ echo " ✓ Paimon Flink 1.20 (v1.2.0)"
+ echo " ✓ Hadoop Apache (v3.3.5-2)"
+ echo " ✓ Flink Faker (v0.5.3)"
+ echo " ✓ Fluss Tiering service"
}
# Run main function
-main "$@"
+main "$@"
\ No newline at end of file
diff --git a/fluss-lake/fluss-lake-iceberg/pom.xml b/fluss-lake/fluss-lake-iceberg/pom.xml
index 94bca76b1..1c13e9db3 100644
--- a/fluss-lake/fluss-lake-iceberg/pom.xml
+++ b/fluss-lake/fluss-lake-iceberg/pom.xml
@@ -318,12 +318,31 @@
<include>*:*</include>
</includes>
</artifactSet>
+ <relocations>
+ <!-- Shade Iceberg to Fluss namespace for complete isolation -->
+ <relocation>
+ <pattern>org.apache.iceberg</pattern>
+ <shadedPattern>org.apache.fluss.lake.iceberg.shaded.org.apache.iceberg</shadedPattern>
+ </relocation>
+ <!-- Shade Jackson to Fluss namespace -->
+ <relocation>
+ <pattern>com.fasterxml.jackson</pattern>
+ <shadedPattern>org.apache.fluss.lake.iceberg.shaded.com.fasterxml.jackson</shadedPattern>
+ </relocation>
+ <!-- Shade Parquet to Fluss namespace -->
+ <relocation>
+ <pattern>org.apache.parquet</pattern>
+ <shadedPattern>org.apache.fluss.lake.iceberg.shaded.org.apache.parquet</shadedPattern>
+ </relocation>
+ </relocations>
<filters>
<filter>
<artifact>*</artifact>
<excludes>
<exclude>LICENSE</exclude>
<exclude>NOTICE</exclude>
+ <!-- Exclude multi-release JAR versions that cause shading issues -->
+ <exclude>META-INF/versions/**</exclude>
</excludes>
</filter>
</filters>
diff --git a/website/docs/quickstart/flink-iceberg.md b/website/docs/quickstart/flink-iceberg.md
index 7a3ac8789..26d44ada4 100644
--- a/website/docs/quickstart/flink-iceberg.md
+++ b/website/docs/quickstart/flink-iceberg.md
@@ -63,7 +63,6 @@ services:
coordinator-server:
image: fluss/fluss:$FLUSS_DOCKER_VERSION$
- command: coordinatorServer
depends_on:
- zookeeper
environment:
@@ -401,8 +400,7 @@ CREATE TABLE datalake_enriched_orders (
`cust_phone` STRING,
`cust_acctbal` DECIMAL(15, 2),
`cust_mktsegment` STRING,
- `nation_name` STRING,
- PRIMARY KEY (`order_key`) NOT ENFORCED
+ `nation_name` STRING
) WITH (
'table.datalake.enabled' = 'true',
'table.datalake.freshness' = '30s'
@@ -429,11 +427,14 @@ SELECT o.order_key,
c.acctbal,
c.mktsegment,
n.name
-FROM fluss_order o
- LEFT JOIN fluss_customer FOR SYSTEM_TIME AS OF `o`.`ptime` AS `c`
- ON o.cust_key = c.cust_key
- LEFT JOIN fluss_nation FOR SYSTEM_TIME AS OF `o`.`ptime` AS `n`
- ON c.nation_key = n.nation_key;
+FROM (
+ SELECT *, PROCTIME() as ptime
+ FROM `default_catalog`.`default_database`.source_order
+) o
+LEFT JOIN fluss_customer FOR SYSTEM_TIME AS OF o.ptime AS c
+ ON o.cust_key = c.cust_key
+LEFT JOIN fluss_nation FOR SYSTEM_TIME AS OF o.ptime AS n
+ ON c.nation_key = n.nation_key;
```
### Real-Time Analytics on Fluss datalake-enabled Tables
@@ -459,11 +460,12 @@ SELECT snapshot_id, operation FROM datalake_enriched_orders$lake$snapshots;
**Sample Output:**
```shell
-+-------------+--------------------+
-| snapshot_id | operation |
-+-------------+--------------------+
-| 1 | append |
-+-------------+--------------------+
++---------------------+-----------+
+| snapshot_id | operation |
++---------------------+-----------+
+| 7792523713868625335 | append |
+| 7960217942125627573 | append |
++---------------------+-----------+
```
**Note:** Make sure to wait for the configured `datalake.freshness` (~30s) to complete before querying the snapshots, otherwise the result will be empty.
@@ -474,11 +476,11 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders$lake;
```
**Sample Output:**
```shell
-+------------+
-| sum_price |
-+------------+
-| 1669519.92 |
-+------------+
++-----------+
+| sum_price |
++-----------+
+| 432880.93 |
++-----------+
```
To achieve results with sub-second data freshness, you can query the table directly, which seamlessly unifies data from both Fluss and Iceberg:
@@ -490,23 +492,23 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders;
**Sample Output:**
```shell
-+------------+
-| sum_price |
-+------------+
-| 1777908.36 |
-+------------+
++-----------+
+| sum_price |
++-----------+
+| 558660.03 |
++-----------+
```
You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
Finally, you can use the following command to view the files stored in Iceberg:
```shell
-docker compose exec taskmanager tree /tmp/iceberg/fluss.db
+docker compose exec taskmanager tree /tmp/iceberg/fluss
```
**Sample Output:**
```shell
-/tmp/iceberg/fluss.db
+/tmp/iceberg/fluss
└── datalake_enriched_orders
├── data
│ └── 00000-0-abc123.parquet