This is an automated email from the ASF dual-hosted git repository.
danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 65c1b1217ece feat: Notebooks to support multiple hudi versions (#18255)
65c1b1217ece is described below
commit 65c1b1217ece29b7a5581dcd762ca00773dd5c20
Author: Ranga Reddy <[email protected]>
AuthorDate: Mon Mar 2 09:37:07 2026 +0530
feat: Notebooks to support multiple hudi versions (#18255)
---
hudi-notebooks/Dockerfile.spark | 49 +++-
hudi-notebooks/build.sh | 22 +-
hudi-notebooks/conf/spark/spark-defaults.conf | 5 -
hudi-notebooks/docker-compose.yml | 4 +-
hudi-notebooks/notebooks/01-crud-operations.ipynb | 2 +-
hudi-notebooks/notebooks/02-query-types.ipynb | 2 +-
.../notebooks/03-scd-type2_and_type4.ipynb | 2 +-
hudi-notebooks/notebooks/04-schema-evolution.ipynb | 2 +-
.../notebooks/05-mastering-sql-procedures.ipynb | 2 +-
.../notebooks/06_hudi_trino_example.ipynb | 325 +++++++++++++++++++++
.../notebooks/07_hudi_presto_example.ipynb | 325 +++++++++++++++++++++
hudi-notebooks/notebooks/utils.py | 191 +++++++-----
hudi-notebooks/requirements.txt | 6 +
hudi-notebooks/run_spark_hudi.sh | 2 +
14 files changed, 830 insertions(+), 109 deletions(-)
diff --git a/hudi-notebooks/Dockerfile.spark b/hudi-notebooks/Dockerfile.spark
index 18f5d06274cb..7a9b2650bafd 100644
--- a/hudi-notebooks/Dockerfile.spark
+++ b/hudi-notebooks/Dockerfile.spark
@@ -14,33 +14,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-ARG SPARK_VERSION=${SPARK_VERSION:-3.5.7}
-
-FROM apache/spark:$SPARK_VERSION-scala2.12-java17-python3-ubuntu
+ARG SPARK_VERSION=${SPARK_VERSION:-3.4.4}
+ARG JAVA_VERSION=${JAVA_VERSION:-11}
+ARG SCALA_VERSION=${SCALA_VERSION:-2.12}
+ARG PYTHON_VERSION=${PYTHON_VERSION:-3}
+FROM apache/spark:$SPARK_VERSION-scala$SCALA_VERSION-java$JAVA_VERSION-python$PYTHON_VERSION-ubuntu
USER root
-ARG HADOOP_VERSION=${HADOOP_VERSION:-3.3.4} \
+ARG SPARK_VERSION=${SPARK_VERSION:-3.4.4} \
+ HADOOP_VERSION=${HADOOP_VERSION:-3.3.4} \
AWS_SDK_VERSION=${AWS_SDK_VERSION:-1.12.772} \
MVN_REPO_URL=https://repo1.maven.org/maven2 \
- HUDI_VERSION=${HUDI_VERSION:-1.0.2}
+ HUDI_VERSION=${HUDI_VERSION:-1.0.2} \
+ SCALA_VERSION=${SCALA_VERSION:-2.12} \
+ HUDI_HOME=${HUDI_HOME:-/opt/hudi} \
+ NOTEBOOK_HOME=${NOTEBOOK_HOME:-/opt/notebooks}
-ENV PATH=$SPARK_HOME/bin:$PATH \
- NOTEBOOK_HOME=/opt/notebooks \
- HUDI_VERSION=$HUDI_VERSION \
+ENV SPARK_VERSION=$SPARK_VERSION \
+ SCALA_VERSION=$SCALA_VERSION \
+ PATH=$SPARK_HOME/bin:$PATH \
+ NOTEBOOK_HOME=$NOTEBOOK_HOME \
+ HUDI_HOME=$HUDI_HOME \
AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT=true
-RUN mkdir -p $HUDI_HOME $NOTEBOOK_HOME && \
-    wget -O $SPARK_HOME/jars/hadoop-aws.jar $MVN_REPO_URL/org/apache/hadoop/hadoop-aws/$HADOOP_VERSION/hadoop-aws-$HADOOP_VERSION.jar && \
-    wget -O $SPARK_HOME/jars/aws-java-sdk-bundle.jar $MVN_REPO_URL/com/amazonaws/aws-java-sdk-bundle/$AWS_SDK_VERSION/aws-java-sdk-bundle-$AWS_SDK_VERSION.jar && \
-    wget -O $SPARK_HOME/jars/hudi-spark3.5-bundle_2.12-$HUDI_VERSION.jar $MVN_REPO_URL/org/apache/hudi/hudi-spark3.5-bundle_2.12/$HUDI_VERSION/hudi-spark3.5-bundle_2.12-$HUDI_VERSION.jar
+ARG SPARK_MINOR_VERSION=${SPARK_VERSION%.*}
+ARG HUDI_SPARK_BUNDLE=hudi-spark${SPARK_MINOR_VERSION}-bundle_$SCALA_VERSION
+ARG HUDI_SPARK_BUNDLE_JAR=$HUDI_SPARK_BUNDLE-$HUDI_VERSION.jar
+
+RUN mkdir -p ${HUDI_HOME}/${HUDI_VERSION} $NOTEBOOK_HOME && \
+    wget -O $SPARK_HOME/jars/hadoop-aws.jar \
+        $MVN_REPO_URL/org/apache/hadoop/hadoop-aws/$HADOOP_VERSION/hadoop-aws-$HADOOP_VERSION.jar && \
+    wget -O $SPARK_HOME/jars/aws-java-sdk-bundle.jar \
+        $MVN_REPO_URL/com/amazonaws/aws-java-sdk-bundle/$AWS_SDK_VERSION/aws-java-sdk-bundle-$AWS_SDK_VERSION.jar && \
+    wget -O ${HUDI_HOME}/${HUDI_VERSION}/${HUDI_SPARK_BUNDLE_JAR} \
+        $MVN_REPO_URL/org/apache/hudi/${HUDI_SPARK_BUNDLE}/${HUDI_VERSION}/${HUDI_SPARK_BUNDLE_JAR}
+
+COPY requirements.txt /tmp/requirements.txt
RUN apt-get update && \
-    apt-get --fix-broken install -y --no-install-recommends python3-pip cargo && \
+    apt-get install -y --no-install-recommends python3-pip cargo && \
pip3 install --upgrade pip && \
- pip3 install jupyter pandas numpy hudi>=0.4.0 boto3 && \
+ pip3 install --no-cache-dir -r /tmp/requirements.txt && \
ln -sf /usr/bin/python3 /usr/bin/python && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* /tmp/requirements.txt
+
COPY notebooks/ $NOTEBOOK_HOME/
COPY conf/spark/ $SPARK_HOME/conf/
diff --git a/hudi-notebooks/build.sh b/hudi-notebooks/build.sh
index 3c4e9f306de9..d4fb1013cc75 100644
--- a/hudi-notebooks/build.sh
+++ b/hudi-notebooks/build.sh
@@ -15,25 +15,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-set -eux
+set -euo pipefail
-export HUDI_VERSION=1.0.2
+export HUDI_VERSION=${HUDI_VERSION:-1.0.2}
export HUDI_VERSION_TAG=${HUDI_VERSION}
-export SPARK_VERSION=3.5.7
-export HIVE_VERSION=3.1.3
+export SPARK_VERSION=${SPARK_VERSION:-3.4.4}
+export HIVE_VERSION=${HIVE_VERSION:-3.1.3}
export HIVE_VERSION_TAG=${HIVE_VERSION}
-export TRINO_VERSION=477
+export TRINO_VERSION=${TRINO_VERSION:-477}
export TRINO_VERSION_TAG=${TRINO_VERSION}
-export PRESTO_VERSION=0.296
+export PRESTO_VERSION=${PRESTO_VERSION:-0.296}
export PRESTO_VERSION_TAG=${PRESTO_VERSION}
+export JAVA_VERSION=${JAVA_VERSION:-11}
+export SCALA_VERSION=${SCALA_VERSION:-2.12}
+export HADOOP_VERSION=${HADOOP_VERSION:-3.3.4}
+export AWS_SDK_VERSION=${AWS_SDK_VERSION:-1.12.772}
-SCRIPT_DIR=$(cd $(dirname $0); pwd)
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "Building Spark Hudi Docker image using Spark version: $SPARK_VERSION and Hudi version: $HUDI_VERSION"
docker build \
--build-arg HUDI_VERSION="$HUDI_VERSION" \
--build-arg SPARK_VERSION="$SPARK_VERSION" \
+ --build-arg JAVA_VERSION="$JAVA_VERSION" \
+ --build-arg SCALA_VERSION="$SCALA_VERSION" \
+ --build-arg HADOOP_VERSION="$HADOOP_VERSION" \
+ --build-arg AWS_SDK_VERSION="$AWS_SDK_VERSION" \
-t apachehudi/spark-hudi:latest \
-t apachehudi/spark-hudi:"$HUDI_VERSION_TAG" \
-f "$SCRIPT_DIR"/Dockerfile.spark .
diff --git a/hudi-notebooks/conf/spark/spark-defaults.conf
b/hudi-notebooks/conf/spark/spark-defaults.conf
index c1c0a9c0dea7..7b58425033d6 100644
--- a/hudi-notebooks/conf/spark/spark-defaults.conf
+++ b/hudi-notebooks/conf/spark/spark-defaults.conf
@@ -22,11 +22,6 @@ spark.sql.warehouse.dir=s3a://warehouse/
spark.hadoop.hive.metastore.warehouse.dir=s3a://warehouse/
spark.hadoop.hive.metastore.uris=thrift://hive-metastore:9083
-spark.serializer=org.apache.spark.serializer.KryoSerializer
-spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension
-spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog
-spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar
-
spark.hadoop.fs.s3a.endpoint=http://minio:9000
spark.hadoop.fs.s3a.access.key=admin
spark.hadoop.fs.s3a.secret.key=password
diff --git a/hudi-notebooks/docker-compose.yml
b/hudi-notebooks/docker-compose.yml
index daff86d5adf8..e4a106a1d791 100644
--- a/hudi-notebooks/docker-compose.yml
+++ b/hudi-notebooks/docker-compose.yml
@@ -1,5 +1,3 @@
-#!/bin/bash
-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -108,7 +106,7 @@ services:
- hive-metastore
- minio
ports:
- - "8086:8080" # Trino Web UI
+ - "8086:8080" # Presto Web UI
networks:
- hudi-datalake
diff --git a/hudi-notebooks/notebooks/01-crud-operations.ipynb
b/hudi-notebooks/notebooks/01-crud-operations.ipynb
index 8c9c9bcadba9..264a6f9ccc60 100644
--- a/hudi-notebooks/notebooks/01-crud-operations.ipynb
+++ b/hudi-notebooks/notebooks/01-crud-operations.ipynb
@@ -855,7 +855,7 @@
"metadata": {},
"outputs": [],
"source": [
- "spark.stop()"
+ "stop_spark_session()"
]
}
],
diff --git a/hudi-notebooks/notebooks/02-query-types.ipynb
b/hudi-notebooks/notebooks/02-query-types.ipynb
index f588760479a3..118a57fe5fae 100644
--- a/hudi-notebooks/notebooks/02-query-types.ipynb
+++ b/hudi-notebooks/notebooks/02-query-types.ipynb
@@ -801,7 +801,7 @@
"metadata": {},
"outputs": [],
"source": [
- "spark.stop()"
+ "stop_spark_session()"
]
}
],
diff --git a/hudi-notebooks/notebooks/03-scd-type2_and_type4.ipynb
b/hudi-notebooks/notebooks/03-scd-type2_and_type4.ipynb
index 144d4a922bf8..f900acccb293 100644
--- a/hudi-notebooks/notebooks/03-scd-type2_and_type4.ipynb
+++ b/hudi-notebooks/notebooks/03-scd-type2_and_type4.ipynb
@@ -483,7 +483,7 @@
"metadata": {},
"outputs": [],
"source": [
- "spark.stop()"
+ "stop_spark_session()"
]
}
],
diff --git a/hudi-notebooks/notebooks/04-schema-evolution.ipynb
b/hudi-notebooks/notebooks/04-schema-evolution.ipynb
index ac34c9d4a70f..170183ef8ddf 100644
--- a/hudi-notebooks/notebooks/04-schema-evolution.ipynb
+++ b/hudi-notebooks/notebooks/04-schema-evolution.ipynb
@@ -438,7 +438,7 @@
"metadata": {},
"outputs": [],
"source": [
- "spark.stop()"
+ "stop_spark_session()"
]
}
],
diff --git a/hudi-notebooks/notebooks/05-mastering-sql-procedures.ipynb
b/hudi-notebooks/notebooks/05-mastering-sql-procedures.ipynb
index 170c03f46e90..a6da781caf52 100644
--- a/hudi-notebooks/notebooks/05-mastering-sql-procedures.ipynb
+++ b/hudi-notebooks/notebooks/05-mastering-sql-procedures.ipynb
@@ -1277,7 +1277,7 @@
"metadata": {},
"outputs": [],
"source": [
- "spark.stop()"
+ "stop_spark_session()"
]
}
],
diff --git a/hudi-notebooks/notebooks/06_hudi_trino_example.ipynb
b/hudi-notebooks/notebooks/06_hudi_trino_example.ipynb
new file mode 100644
index 000000000000..6f8990385bb7
--- /dev/null
+++ b/hudi-notebooks/notebooks/06_hudi_trino_example.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36835b8e-37cb-4c86-ad7f-c0322ffcd7cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Licensed to the Apache Software Foundation (ASF) under one\n",
+ "# or more contributor license agreements. See the NOTICE file\n",
+ "# distributed with this work for additional information\n",
+ "# regarding copyright ownership. The ASF licenses this file\n",
+ "# to you under the Apache License, Version 2.0 (the\n",
+ "# \"License\"); you may not use this file except in compliance\n",
+ "# with the License. You may obtain a copy of the License at\n",
+ "#\n",
+ "# http://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59e2c4b5-2645-453f-8ae5-8475becdf620",
+ "metadata": {},
+ "source": [
+ "<center>\n",
+ "<img src=\"https://hudi.apache.org/assets/images/hudi-logo-medium.png\"
alt=\"Hudi logo\" width=\"100%\" height=\"320\"/>\n",
+ "</center>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b97d03aa-5387-4a55-aa10-a5fc2511ad5e",
+ "metadata": {},
+ "source": [
+ "# Querying Hudi Tables using Trino: A Step-by-Step Guide"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5fa84d4-0ad7-47d0-a9ec-38d7059679c6",
+ "metadata": {},
+ "source": [
+ "This guide demonstrates a cross-engine workflow: writing optimized
Lakehouse tables with Apache Spark and querying them with Trino for fast,
interactive analytics."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34c4ce1c-0342-4f6c-a296-864a0311ef6e",
+ "metadata": {},
+ "source": [
+ "### 1. Library Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ae14e01-3fb0-4fb8-be7d-9e8520df2941",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import trino.dbapi\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "04e41c4c-8f59-470f-bcca-12b6cf9b4ac0",
+ "metadata": {},
+ "source": [
+ "### 2. Global Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4a448fc-39a5-4082-be72-eb73f1e77cc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_name = \"trino_db\"\n",
+ "table_name = \"hudi_trips_table\"\n",
+ "s3_base_path = f\"s3a://warehouse/\"\n",
+ "base_path = os.path.join(s3_base_path, db_name, table_name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0add876-ce06-4377-ac8d-60a3b4968921",
+ "metadata": {},
+ "source": [
+ "### 3. Spark Session Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5eb5c7b-6d2e-458f-92df-70befc8edba0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%run utils.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89ec9d7c-7fe7-40d0-876b-60c633af93ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = get_spark_session(app_name = \"Hudi Trino Example\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13d519af-aee1-4f66-b2ba-801ce73587e1",
+ "metadata": {},
+ "source": [
+ "### 4. Sample Data Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2bc1bbf-7106-4442-8d5a-ac7b6f5f9d53",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns = [\"ts\", \"uuid\", \"rider\", \"driver\", \"fare\",
\"city\"]\n",
+ "\n",
+ "data = [\n",
+ " (\"2025-08-10 08:15:30\", \"uuid-001\", \"rider-A\", \"driver-X\",
18.50, \"new_york\"),\n",
+ " (\"2025-08-10 09:22:10\", \"uuid-002\", \"rider-B\", \"driver-Y\",
22.75, \"san_francisco\"),\n",
+ " (\"2025-08-10 10:05:45\", \"uuid-003\", \"rider-C\", \"driver-Z\",
14.60, \"chicago\"),\n",
+ " (\"2025-08-10 11:40:00\", \"uuid-004\", \"rider-D\", \"driver-W\",
31.90, \"new_york\"),\n",
+ " (\"2025-08-10 12:55:15\", \"uuid-005\", \"rider-E\", \"driver-V\",
25.10, \"san_francisco\"),\n",
+ " (\"2025-08-10 13:20:35\", \"uuid-006\", \"rider-F\", \"driver-U\",
19.80, \"chicago\"),\n",
+ " (\"2025-08-10 14:10:05\", \"uuid-007\", \"rider-G\", \"driver-T\",
28.45, \"san_francisco\"),\n",
+ " (\"2025-08-10 15:00:20\", \"uuid-008\", \"rider-H\", \"driver-S\",
16.25, \"new_york\"),\n",
+ " (\"2025-08-10 15:45:50\", \"uuid-009\", \"rider-I\", \"driver-R\",
24.35, \"chicago\"),\n",
+ " (\"2025-08-10 16:30:00\", \"uuid-010\", \"rider-J\", \"driver-Q\",
20.00, \"new_york\")\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19e34d17-66ab-492b-953c-dd8a30392ba3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_df = spark.createDataFrame(data).toDF(*columns)\n",
+ "display(input_df, 5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "278e9aa4-cc2b-4af8-9c1d-11f186ca4954",
+ "metadata": {},
+ "source": [
+ "### 5. Hudi Write Configuration & Ingestion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26ca967e-cbf2-417d-b0d9-215b800dee80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hudi_write_options = {\n",
+ " \"hoodie.table.name\" : table_name,\n",
+ " \"hoodie.datasource.write.recordkey.field\": \"uuid\",\n",
+ " \"hoodie.datasource.write.precombine.field\": \"ts\",\n",
+ " \"hoodie.datasource.write.partitionpath.field\": \"city\",\n",
+ " \"hoodie.metadata.enable\": \"true\",\n",
+ " \"hoodie.datasource.write.hive_style_partitioning\": \"true\",\n",
+ " \"hoodie.datasource.meta.sync.enable\": \"true\",\n",
+ " \"hoodie.datasource.hive_sync.partition_fields\": \"city\",\n",
+ " \"hoodie.datasource.hive_sync.mode\": \"hms\",\n",
+ " \"hoodie.datasource.hive_sync.metastore.uris\":
\"thrift://hive-metastore:9083\",\n",
+ " \"hoodie.datasource.hive_sync.database\": db_name,\n",
+ " \"hoodie.datasource.hive_sync.table\": table_name\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b62958ce-d753-478a-ad40-a6850273144f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the data to Hudi\n",
+ "input_df.write.format(\"hudi\") \\\n",
+ " .options(**hudi_write_options) \\\n",
+ " .mode(\"overwrite\") \\\n",
+ " .save(base_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c01c17e-023b-4ff2-8e02-4fd57c50fc7d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = spark.sql(f\"SELECT * FROM {db_name}.{table_name}\")\n",
+ "display(df, 5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7226027-2b03-4b68-be4c-f6194a26f6b1",
+ "metadata": {},
+ "source": [
+ "### 6. Querying with Trino"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88b97899-19f3-4662-a98f-28ec7c04354e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Establish Trino Connection\n",
+ "TRINO_HOST='trino'\n",
+ "TRINO_PORT=8080\n",
+ "TRINO_CATALOG='hudi'\n",
+ "TRINO_SCHEMA=db_name\n",
+ "\n",
+ "conn = trino.dbapi.connect(\n",
+ " host=TRINO_HOST,\n",
+ " port=TRINO_PORT,\n",
+ " user='trino', \n",
+ " catalog=TRINO_CATALOG,\n",
+ " schema=TRINO_SCHEMA\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b77c5e2d-83a6-4ed7-a894-f26af7e02ce2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Query the Data from Trino\n",
+ "try:\n",
+ " query = f\"SELECT * FROM {table_name} LIMIT 100\"\n",
+ " cur = conn.cursor()\n",
+ " cur.execute(query)\n",
+ "\n",
+ " rows = cur.fetchall()\n",
+ " colnames = [desc[0] for desc in cur.description]\n",
+ " pandas_df = pd.DataFrame(rows, columns=colnames)\n",
+ "finally:\n",
+ " cur.close()\n",
+ "\n",
+ "df = spark.createDataFrame(pandas_df)\n",
+ "display(df, 5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e01c683b-cbb3-4149-81ab-b4d8df7a2e8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Close the Trino Connection\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4912d31b-aeef-40f1-86e3-49f26c9f415c",
+ "metadata": {},
+ "source": [
+ "### 7. Cleanup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97dc0033-5043-4858-a053-f41afb833cce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stop the Spark Session\n",
+ "stop_spark_session()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/hudi-notebooks/notebooks/07_hudi_presto_example.ipynb
b/hudi-notebooks/notebooks/07_hudi_presto_example.ipynb
new file mode 100644
index 000000000000..8ef3a4f89a17
--- /dev/null
+++ b/hudi-notebooks/notebooks/07_hudi_presto_example.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "18c86d5b-7c3c-49dc-83f0-65b76fc5b48b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Licensed to the Apache Software Foundation (ASF) under one\n",
+ "# or more contributor license agreements. See the NOTICE file\n",
+ "# distributed with this work for additional information\n",
+ "# regarding copyright ownership. The ASF licenses this file\n",
+ "# to you under the Apache License, Version 2.0 (the\n",
+ "# \"License\"); you may not use this file except in compliance\n",
+ "# with the License. You may obtain a copy of the License at\n",
+ "#\n",
+ "# http://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c99256bf-9ad4-4b47-b00d-56ed0827b324",
+ "metadata": {},
+ "source": [
+ "<center>\n",
+ "<img src=\"https://hudi.apache.org/assets/images/hudi-logo-medium.png\"
alt=\"Hudi logo\" width=\"100%\" height=\"320\"/>\n",
+ "</center>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d06aec4e-8c50-46c9-8f72-86be2a3a54c2",
+ "metadata": {},
+ "source": [
+ "# Querying Hudi Tables using Presto: A Step-by-Step Guide"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9dcd814-8af6-44b1-90a0-033151435c0b",
+ "metadata": {},
+ "source": [
+ "This guide demonstrates a decoupled Data Lakehouse architecture,
showcasing how to ingest and optimize data using Apache Spark and subsequently
execute high-performance interactive analytics using Presto."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "429bbf4b-7112-475a-aab8-3248b208c743",
+ "metadata": {},
+ "source": [
+ "### 1. Library Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ae14e01-3fb0-4fb8-be7d-9e8520df2941",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import prestodb\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38595256-2537-4569-829d-6af6c510a1c6",
+ "metadata": {},
+ "source": [
+ "### 2. Global Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4a448fc-39a5-4082-be72-eb73f1e77cc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db_name = \"presto_db\"\n",
+ "table_name = \"hudi_trips_table\"\n",
+ "s3_base_path = f\"s3a://warehouse/\"\n",
+ "base_path = os.path.join(s3_base_path, db_name, table_name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a4338d-ebc2-4e80-9bcd-5dbbb6e2076c",
+ "metadata": {},
+ "source": [
+ "### 3. Spark Session Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5eb5c7b-6d2e-458f-92df-70befc8edba0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%run utils.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89ec9d7c-7fe7-40d0-876b-60c633af93ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark = get_spark_session(app_name = \"Hudi Presto Example\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fc05f60-4972-4e40-8256-0bbd5b04a2ff",
+ "metadata": {},
+ "source": [
+ "### 4. Sample Data Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2bc1bbf-7106-4442-8d5a-ac7b6f5f9d53",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "columns = [\"ts\", \"uuid\", \"rider\", \"driver\", \"fare\",
\"city\"]\n",
+ "\n",
+ "data = [\n",
+ " (\"2025-08-10 08:15:30\", \"uuid-001\", \"rider-A\", \"driver-X\",
18.50, \"new_york\"),\n",
+ " (\"2025-08-10 09:22:10\", \"uuid-002\", \"rider-B\", \"driver-Y\",
22.75, \"san_francisco\"),\n",
+ " (\"2025-08-10 10:05:45\", \"uuid-003\", \"rider-C\", \"driver-Z\",
14.60, \"chicago\"),\n",
+ " (\"2025-08-10 11:40:00\", \"uuid-004\", \"rider-D\", \"driver-W\",
31.90, \"new_york\"),\n",
+ " (\"2025-08-10 12:55:15\", \"uuid-005\", \"rider-E\", \"driver-V\",
25.10, \"san_francisco\"),\n",
+ " (\"2025-08-10 13:20:35\", \"uuid-006\", \"rider-F\", \"driver-U\",
19.80, \"chicago\"),\n",
+ " (\"2025-08-10 14:10:05\", \"uuid-007\", \"rider-G\", \"driver-T\",
28.45, \"san_francisco\"),\n",
+ " (\"2025-08-10 15:00:20\", \"uuid-008\", \"rider-H\", \"driver-S\",
16.25, \"new_york\"),\n",
+ " (\"2025-08-10 15:45:50\", \"uuid-009\", \"rider-I\", \"driver-R\",
24.35, \"chicago\"),\n",
+ " (\"2025-08-10 16:30:00\", \"uuid-010\", \"rider-J\", \"driver-Q\",
20.00, \"new_york\")\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19e34d17-66ab-492b-953c-dd8a30392ba3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_df = spark.createDataFrame(data).toDF(*columns)\n",
+ "display(input_df, 5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1876aff9-f65c-4ed1-b356-b3f6bb55f355",
+ "metadata": {},
+ "source": [
+ "### 5. Hudi Write Configuration & Ingestion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26ca967e-cbf2-417d-b0d9-215b800dee80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hudi_write_options = {\n",
+ " \"hoodie.table.name\" : table_name,\n",
+ " \"hoodie.datasource.write.recordkey.field\": \"uuid\",\n",
+ " \"hoodie.datasource.write.precombine.field\": \"ts\",\n",
+ " \"hoodie.datasource.write.partitionpath.field\": \"city\",\n",
+ " \"hoodie.metadata.enable\": \"true\",\n",
+ " \"hoodie.datasource.write.hive_style_partitioning\": \"true\",\n",
+ " \"hoodie.datasource.meta.sync.enable\": \"true\",\n",
+ " \"hoodie.datasource.hive_sync.partition_fields\": \"city\",\n",
+ " \"hoodie.datasource.hive_sync.mode\": \"hms\",\n",
+ " \"hoodie.datasource.hive_sync.metastore.uris\":
\"thrift://hive-metastore:9083\",\n",
+ " \"hoodie.datasource.hive_sync.database\": db_name,\n",
+ " \"hoodie.datasource.hive_sync.table\": table_name\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b62958ce-d753-478a-ad40-a6850273144f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the data to Hudi\n",
+ "input_df.write.format(\"hudi\") \\\n",
+ " .options(**hudi_write_options) \\\n",
+ " .mode(\"overwrite\") \\\n",
+ " .save(base_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c01c17e-023b-4ff2-8e02-4fd57c50fc7d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = spark.sql(f\"SELECT * FROM {db_name}.{table_name}\")\n",
+ "display(df, 5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7aef5bc-9324-4106-8cd6-b806e341081d",
+ "metadata": {},
+ "source": [
+ "### 6. Querying with Presto"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88b97899-19f3-4662-a98f-28ec7c04354e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Establish Presto Connection\n",
+ "PRESTO_HOST='presto'\n",
+ "PRESTO_PORT=8080\n",
+ "PRESTO_CATALOG='hudi'\n",
+ "PRESTO_SCHEMA=db_name\n",
+ "\n",
+ "conn = prestodb.dbapi.connect(\n",
+ " host=PRESTO_HOST,\n",
+ " port=PRESTO_PORT,\n",
+ " user='presto',\n",
+ " catalog=PRESTO_CATALOG,\n",
+ " schema=PRESTO_SCHEMA,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b77c5e2d-83a6-4ed7-a894-f26af7e02ce2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Query the Data from Presto\n",
+ "try:\n",
+ " query = f\"SELECT * FROM {table_name} LIMIT 100\"\n",
+ " cur = conn.cursor()\n",
+ " cur.execute(query)\n",
+ "\n",
+ " rows = cur.fetchall()\n",
+ " colnames = [desc[0] for desc in cur.description]\n",
+ " pandas_df = pd.DataFrame(rows, columns=colnames)\n",
+ "finally:\n",
+ " cur.close()\n",
+ "\n",
+ "df = spark.createDataFrame(pandas_df)\n",
+ "display(df, 5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e01c683b-cbb3-4149-81ab-b4d8df7a2e8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Close the Presto Connection\n",
+ "conn.close()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8ce78347-b4de-442c-9790-f1ae372df363",
+ "metadata": {},
+ "source": [
+ "### 7. Cleanup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97dc0033-5043-4858-a053-f41afb833cce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Stop the Spark Session\n",
+ "stop_spark_session()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/hudi-notebooks/notebooks/utils.py
b/hudi-notebooks/notebooks/utils.py
index 5318a1340cef..62e5c2af41fb 100644
--- a/hudi-notebooks/notebooks/utils.py
+++ b/hudi-notebooks/notebooks/utils.py
@@ -14,50 +14,125 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
+
from pyspark.sql import SparkSession
-from IPython.display import display as display_html, HTML
-import boto3
-from urllib.parse import urlparse
+from IPython.display import HTML, display as display_html
+
+_spark = None
+
+# Default number of rows to show in display()
+DEFAULT_DISPLAY_ROWS = 100
-def get_spark_session(app_name="Hudi-Notebooks", log_level="WARN"):
+# Reused HTML/CSS for DataFrame display (avoids string rebuild on every call)
+_DISPLAY_TABLE_CSS = """
+<style>
+ .dataframe {
+ border-radius: 0.5rem;
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0,
0, 0, 0.06);
+ overflow-x: auto;
+ border: 1px solid #e2e8f0;
+ }
+ .dataframe th {
+ background-color: #f1f5f9;
+ color: #1f2937;
+ font-weight: 600;
+ padding: 0.75rem 1.5rem;
+ text-align: left;
+ border-bottom: 2px solid #e2e8f0;
+ }
+ .dataframe td {
+ padding: 0.75rem 1.5rem;
+ border-bottom: 1px solid #e2e8f0;
+ }
+ .dataframe tr:nth-child(even) {
+ background-color: #f8fafc;
+ }
+ .dataframe tr:hover {
+ background-color: #e2e8f0;
+ transition: background-color 0.2s ease-in-out;
+ }
+</style>
+"""
+
+
+def get_spark_session(
+ app_name="Hudi-Notebooks",
+ log_level="WARN",
+ hudi_version="1.0.2",
+):
"""
- Initialize a SparkSession
-
+ Initialize a SparkSession (singleton).
+
Parameters:
- app_name (str): Optional name for the Spark application.
- log_level (str): Log level for Spark (DEBUG, INFO, WARN, ERROR).
Defaults to WARN.
-
+ - hudi_version (str): Hudi bundle version. Defaults to 1.0.2.
+
Returns:
- SparkSession object
"""
-
- spark_session = SparkSession.builder \
- .appName(app_name) \
- .config("spark.hadoop.fs.defaultFS", "s3a://warehouse") \
- .config("spark.log.level", log_level) \
- .enableHiveSupport() \
+ global _spark
+
+ if _spark is not None:
+ return _spark
+
+ hudi_home = os.getenv("HUDI_HOME")
+ spark_version = os.getenv("SPARK_VERSION", "3.5.7")
+ spark_minor_version = ".".join(spark_version.split(".")[:2])
+ scala_version = os.getenv("SCALA_VERSION", "2.12")
+ hudi_jar_path =
f"hudi-spark{spark_minor_version}-bundle_{scala_version}-{hudi_version}.jar"
+ hudi_packages =
f"org.apache.hudi:hudi-spark{spark_minor_version}-bundle_{scala_version}:{hudi_version}"
+ hudi_jars = os.path.join(hudi_home, hudi_version, hudi_jar_path)
+ use_jars = os.path.exists(hudi_jars)
+ conf_param = "spark.jars" if use_jars else "spark.jars.packages"
+ conf_value = hudi_jars if use_jars else hudi_packages
+
+ _spark = (
+ SparkSession.builder.appName(app_name)
+ .config(conf_param, conf_value)
+ .config("spark.hadoop.fs.defaultFS", "s3a://warehouse")
+ .config("spark.serializer",
"org.apache.spark.serializer.KryoSerializer")
+ .config("spark.sql.extensions",
"org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
+ .config("spark.sql.catalog.spark_catalog",
"org.apache.spark.sql.hudi.catalog.HoodieCatalog")
+ .config("spark.kryo.registrator",
"org.apache.spark.HoodieSparkKryoRegistrar")
+ .enableHiveSupport()
.getOrCreate()
-
+ )
+
+ _spark.sparkContext.setLogLevel(log_level)
print(f"SparkSession started with app name: {app_name}, log level:
{log_level}")
-
- return spark_session
-# Initialize Spark globally so other functions can use it
-spark = get_spark_session()
+ return _spark
+
+
+def stop_spark_session():
+ """Stop the global SparkSession and clear the singleton."""
+ global _spark
+ if _spark is not None:
+ _spark.stop()
+ _spark = None
+ print("SparkSession stopped successfully.")
+
-# S3 Utility Function
def ls(base_path):
"""
List files or directories at the given MinIO S3 path.
-
- Example: ls("s3a://warehouse/hudi_table/")
+
+ Args:
+ base_path: Path starting with 's3a://' (e.g.
s3a://warehouse/hudi_table/).
"""
if not base_path.startswith("s3a://"):
raise ValueError("Path must start with 's3a://'")
+
+ global _spark
+ if _spark is None:
+ raise RuntimeError("SparkSession not initialized. Call
get_spark_session() first.")
+
try:
- hadoop_conf = spark._jsc.hadoopConfiguration()
- fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
- p = spark._jvm.org.apache.hadoop.fs.Path(base_path)
+ hadoop_conf = _spark._jsc.hadoopConfiguration()
+ fs = _spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
+ p = _spark._jvm.org.apache.hadoop.fs.Path(base_path)
if not fs.exists(p):
print(f"Path does not exist: {base_path}")
return []
@@ -69,63 +144,31 @@ def ls(base_path):
print(f"Exception occurred while listing files from path {base_path}",
e)
-# Display Utility Function
-def display(df, num_rows=100):
+def display(df, num_rows=None):
"""
- Displays a PySpark DataFrame in a formatted HTML table.
-
- This function is designed to mimic the Databricks 'display' function by
- presenting a sample of the DataFrame in a clean, readable table format
- using HTML and Tailwind CSS for styling.
+ Display a PySpark DataFrame as a formatted HTML table (Databricks-style).
Args:
- df (pyspark.sql.DataFrame): The PySpark DataFrame to display.
- num_rows (int): The number of rows to show. Defaults to 100.
+ df (pyspark.sql.DataFrame): The DataFrame to display.
+ num_rows (int): Number of rows to show. Defaults to
DEFAULT_DISPLAY_ROWS (100).
"""
-
- # Collect a limited number of rows to the driver as a Pandas DataFrame
+ if num_rows is None:
+ num_rows = DEFAULT_DISPLAY_ROWS
+
try:
pandas_df = df.limit(num_rows).toPandas()
except Exception as e:
print(f"Error converting DataFrame to Pandas: {e}")
return
- # Use pandas to_html to get a clean table, then add custom styling.
- # The styling uses Tailwind CSS classes for a clean, modern look.
- html_table = pandas_df.to_html(index=False, classes=[
- "w-full", "border-collapse", "text-sm", "text-gray-900",
"dark:text-white"
- ])
-
- # We are adding custom styling here to make it look like a well-formatted
blog post table.
- custom_css = """
- <style>
- .dataframe {
- border-radius: 0.5rem;
- box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px
rgba(0, 0, 0, 0.06);
- overflow-x: auto;
- border: 1px solid #e2e8f0;
- }
- .dataframe th {
- background-color: #f1f5f9;
- color: #1f2937;
- font-weight: 600;
- padding: 0.75rem 1.5rem;
- text-align: left;
- border-bottom: 2px solid #e2e8f0;
- }
- .dataframe td {
- padding: 0.75rem 1.5rem;
- border-bottom: 1px solid #e2e8f0;
- }
- .dataframe tr:nth-child(even) {
- background-color: #f8fafc;
- }
- .dataframe tr:hover {
- background-color: #e2e8f0;
- transition: background-color 0.2s ease-in-out;
- }
- </style>
- """
-
- # Display the final HTML
- display_html(HTML(custom_css + html_table))
+ html_table = pandas_df.to_html(
+ index=False,
+ classes=[
+ "w-full",
+ "border-collapse",
+ "text-sm",
+ "text-gray-900",
+ "dark:text-white",
+ ],
+ )
+ display_html(HTML(_DISPLAY_TABLE_CSS + html_table))
diff --git a/hudi-notebooks/requirements.txt b/hudi-notebooks/requirements.txt
new file mode 100644
index 000000000000..ec4c9fb131c7
--- /dev/null
+++ b/hudi-notebooks/requirements.txt
@@ -0,0 +1,6 @@
+jupyter
+pandas
+numpy
+boto3
+trino
+presto-python-client
\ No newline at end of file
diff --git a/hudi-notebooks/run_spark_hudi.sh b/hudi-notebooks/run_spark_hudi.sh
index 618737a3273c..9c8726c6888f 100644
--- a/hudi-notebooks/run_spark_hudi.sh
+++ b/hudi-notebooks/run_spark_hudi.sh
@@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+set -euo pipefail
+
state=${1:-"start"}
state=$(echo "$state" | tr '[:upper:]' '[:lower:]')