This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 224bca3794 [docker](hudi) add hudi docker compose (#19048)
224bca3794 is described below
commit 224bca379470b3a396d40e680dd40436c9f6d2b0
Author: hechao <[email protected]>
AuthorDate: Tue May 2 09:54:52 2023 +0800
[docker](hudi) add hudi docker compose (#19048)
---
docker/thirdparties/docker-compose/hudi/hadoop.env | 52 ++++
.../thirdparties/docker-compose/hudi/hudi.yaml.tpl | 267 +++++++++++++++++++++
.../hudi/scripts/config/base.properties | 25 ++
.../hudi/scripts/config/dfs-source.properties | 31 +++
.../hudi/scripts/config/hoodie-incr.properties | 34 +++
.../hudi/scripts/config/hoodie-schema.avsc | 146 +++++++++++
.../hudi/scripts/config/kafka-source.properties | 30 +++
.../hudi/scripts/config/log4j2.properties | 61 +++++
.../docker-compose/hudi/scripts/config/schema.avsc | 59 +++++
.../hudi/scripts/config/spark-defaults.conf | 30 +++
.../docker-compose/hudi/scripts/run_sync_tool.sh | 56 +++++
.../hudi/scripts/setup_demo_container_adhoc_1.sh | 31 +++
.../hudi/scripts/setup_demo_container_adhoc_2.sh | 77 ++++++
docker/thirdparties/run-thirdparties-docker.sh | 31 ++-
.../developer-guide/regression-testing.md | 127 +++++++---
15 files changed, 1021 insertions(+), 36 deletions(-)
diff --git a/docker/thirdparties/docker-compose/hudi/hadoop.env
b/docker/thirdparties/docker-compose/hudi/hadoop.env
new file mode 100644
index 0000000000..28ef46c3eb
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/hadoop.env
@@ -0,0 +1,52 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
+HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
+HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
+HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
+HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
+HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
+
+HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
+HDFS_CONF_dfs_webhdfs_enabled=true
+HDFS_CONF_dfs_permissions_enabled=false
+#HDFS_CONF_dfs_client_use_datanode_hostname=true
+#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
+HDFS_CONF_dfs_replication=1
+
+CORE_CONF_fs_defaultFS=hdfs://namenode:8020
+CORE_CONF_hadoop_http_staticuser_user=root
+CORE_CONF_hadoop_proxyuser_hue_hosts=*
+CORE_CONF_hadoop_proxyuser_hue_groups=*
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+YARN_CONF_yarn_timeline___service_hostname=historyserver
+YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
+YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
+YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
+YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
diff --git a/docker/thirdparties/docker-compose/hudi/hudi.yaml.tpl
b/docker/thirdparties/docker-compose/hudi/hudi.yaml.tpl
new file mode 100644
index 0000000000..f0878e452b
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/hudi.yaml.tpl
@@ -0,0 +1,267 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: "3.3"
+
+networks:
+ doris--hudi:
+ driver: bridge
+
+services:
+
+ namenode:
+ image: apachehudi/hudi-hadoop_2.8.4-namenode:latest
+ hostname: namenode
+ container_name: namenode
+ environment:
+ - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
+ ports:
+ - "50070:50070"
+ - "8020:8020"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ env_file:
+ - ./hadoop.env
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://namenode:50070"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ networks:
+ - doris--hudi
+
+ datanode1:
+ image: apachehudi/hudi-hadoop_2.8.4-datanode:latest
+ container_name: datanode1
+ hostname: datanode1
+ environment:
+ - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "50075:50075"
+ - "50010:50010"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ links:
+ - "namenode"
+ - "historyserver"
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://datanode1:50075"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ depends_on:
+ - namenode
+ networks:
+ - doris--hudi
+
+ historyserver:
+ image: apachehudi/hudi-hadoop_2.8.4-history:latest
+ hostname: historyserver
+ container_name: historyserver
+ environment:
+ - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
+ depends_on:
+ - "namenode"
+ links:
+ - "namenode"
+ ports:
+ - "58188:8188"
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://historyserver:8188"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ env_file:
+ - ./hadoop.env
+ volumes:
+ - ./historyserver:/hadoop/yarn/timeline
+ networks:
+ - doris--hudi
+
+ hive-metastore-postgresql:
+ image: bde2020/hive-metastore-postgresql:2.3.0
+ volumes:
+ - ./hive-metastore-postgresql:/var/lib/postgresql
+ hostname: hive-metastore-postgresql
+ container_name: hive-metastore-postgresql
+ networks:
+ - doris--hudi
+
+ hivemetastore:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
+ hostname: hivemetastore
+ container_name: hivemetastore
+ links:
+ - "hive-metastore-postgresql"
+ - "namenode"
+ env_file:
+ - ./hadoop.env
+ command: /opt/hive/bin/hive --service metastore
+ environment:
+ SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
+ ports:
+ - "9083:9083"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ healthcheck:
+ test: ["CMD", "nc", "-z", "hivemetastore", "9083"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ depends_on:
+ - "hive-metastore-postgresql"
+ - "namenode"
+ networks:
+ - doris--hudi
+
+ hiveserver:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
+ hostname: hiveserver
+ container_name: hiveserver
+ env_file:
+ - ./hadoop.env
+ environment:
+ SERVICE_PRECONDITION: "hivemetastore:9083"
+ ports:
+ - "10000:10000"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ depends_on:
+ - "hivemetastore"
+ links:
+ - "hivemetastore"
+ - "hive-metastore-postgresql"
+ - "namenode"
+ volumes:
+ - ./scripts:/var/scripts
+ networks:
+ - doris--hudi
+
+ sparkmaster:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:latest
+ hostname: sparkmaster
+ container_name: sparkmaster
+ env_file:
+ - ./hadoop.env
+ ports:
+ - "8080:8080"
+ - "7077:7077"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ environment:
+ - INIT_DAEMON_STEP=setup_spark
+ links:
+ - "hivemetastore"
+ - "hiveserver"
+ - "hive-metastore-postgresql"
+ - "namenode"
+ networks:
+ - doris--hudi
+
+ spark-worker-1:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:latest
+ hostname: spark-worker-1
+ container_name: spark-worker-1
+ env_file:
+ - ./hadoop.env
+ depends_on:
+ - sparkmaster
+ ports:
+ - "8081:8081"
+ # JVM debugging port (will be mapped to a random port on host)
+ - "5005"
+ environment:
+ - "SPARK_MASTER=spark://sparkmaster:7077"
+ links:
+ - "hivemetastore"
+ - "hiveserver"
+ - "hive-metastore-postgresql"
+ - "namenode"
+ networks:
+ - doris--hudi
+
+# zookeeper:
+# image: 'bitnami/zookeeper:3.4.12-r68'
+# hostname: zookeeper
+# container_name: zookeeper
+# ports:
+# - "2181:2181"
+# environment:
+# - ALLOW_ANONYMOUS_LOGIN=yes
+# networks:
+# - doris--hudi
+
+# kafka:
+# image: 'bitnami/kafka:2.0.0'
+# hostname: kafkabroker
+# container_name: kafkabroker
+# ports:
+# - "9092:9092"
+# environment:
+# - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
+# - ALLOW_PLAINTEXT_LISTENER=yes
+# networks:
+# - doris--hudi
+
+ adhoc-1:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
+ hostname: adhoc-1
+ container_name: adhoc-1
+ env_file:
+ - ./hadoop.env
+ depends_on:
+ - sparkmaster
+ ports:
+ - '4040:4040'
+ # JVM debugging port (mapped to 5006 on the host)
+ - "5006:5005"
+ environment:
+ - "SPARK_MASTER=spark://sparkmaster:7077"
+ links:
+ - "hivemetastore"
+ - "hiveserver"
+ - "hive-metastore-postgresql"
+ - "namenode"
+ volumes:
+ - ./scripts:/var/scripts
+ networks:
+ - doris--hudi
+
+ adhoc-2:
+ image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
+ hostname: adhoc-2
+ container_name: adhoc-2
+ env_file:
+ - ./hadoop.env
+ ports:
+ # JVM debugging port (mapped to 5005 on the host)
+ - "5005:5005"
+ depends_on:
+ - sparkmaster
+ environment:
+ - "SPARK_MASTER=spark://sparkmaster:7077"
+ links:
+ - "hivemetastore"
+ - "hiveserver"
+ - "hive-metastore-postgresql"
+ - "namenode"
+ volumes:
+ - ./scripts:/var/scripts
+ networks:
+ - doris--hudi
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/base.properties
b/docker/thirdparties/docker-compose/hudi/scripts/config/base.properties
new file mode 100644
index 0000000000..0666245758
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/config/base.properties
@@ -0,0 +1,25 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+hoodie.upsert.shuffle.parallelism=2
+hoodie.insert.shuffle.parallelism=2
+hoodie.delete.shuffle.parallelism=2
+hoodie.bulkinsert.shuffle.parallelism=2
+hoodie.embed.timeline.server=true
+hoodie.filesystem.view.type=EMBEDDED_KV_STORE
+hoodie.compact.inline=false
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/dfs-source.properties
b/docker/thirdparties/docker-compose/hudi/scripts/config/dfs-source.properties
new file mode 100644
index 0000000000..04c16e272a
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hudi/scripts/config/dfs-source.properties
@@ -0,0 +1,31 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include=base.properties
+# Key fields, for kafka example
+hoodie.datasource.write.recordkey.field=key
+hoodie.datasource.write.partitionpath.field=date
+# NOTE: We have to duplicate configuration since this is being used
+# w/ both Spark and DeltaStreamer
+hoodie.table.recordkey.fields=key
+hoodie.table.partition.fields=date
+# Schema provider props (change to absolute path based on your installation)
+hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc
+hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc
+# DFS Source
+hoodie.deltastreamer.source.dfs.root=/usr/hive/data/input/
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-incr.properties
b/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-incr.properties
new file mode 100644
index 0000000000..c796063ff1
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-incr.properties
@@ -0,0 +1,34 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+hoodie.upsert.shuffle.parallelism=2
+hoodie.insert.shuffle.parallelism=2
+hoodie.delete.shuffle.parallelism=2
+hoodie.bulkinsert.shuffle.parallelism=2
+hoodie.datasource.write.recordkey.field=_row_key
+hoodie.datasource.write.partitionpath.field=partition
+hoodie.deltastreamer.schemaprovider.source.schema.file=file:///var/hoodie/ws/docker/demo/config/hoodie-schema.avsc
+hoodie.deltastreamer.schemaprovider.target.schema.file=file:///var/hoodie/ws/docker/demo/config/hoodie-schema.avsc
+hoodie.deltastreamer.source.hoodieincr.partition.fields=partition
+hoodie.deltastreamer.source.hoodieincr.path=/docker_hoodie_sync_valid_test
+hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=true
+# hive sync
+hoodie.datasource.hive_sync.table=docker_hoodie_sync_valid_test_2
+hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000/
+hoodie.datasource.hive_sync.partition_fields=partition
+hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-schema.avsc
b/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-schema.avsc
new file mode 100644
index 0000000000..f97742c947
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/config/hoodie-schema.avsc
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+{
+ "type": "record",
+ "name": "triprec",
+ "fields": [
+ {
+ "name": "timestamp",
+ "type": "double"
+ },
+ {
+ "name": "_row_key",
+ "type": "string"
+ },
+ {
+ "name": "rider",
+ "type": "string"
+ },
+ {
+ "name": "driver",
+ "type": "string"
+ },
+ {
+ "name": "begin_lat",
+ "type": "double"
+ },
+ {
+ "name": "begin_lon",
+ "type": "double"
+ },
+ {
+ "name": "end_lat",
+ "type": "double"
+ },
+ {
+ "name": "end_lon",
+ "type": "double"
+ },
+ {
+ "name": "distance_in_meters",
+ "type": "int"
+ },
+ {
+ "name": "seconds_since_epoch",
+ "type": "long"
+ },
+ {
+ "name": "weight",
+ "type": "float"
+ },
+ {
+ "name": "nation",
+ "type": "bytes"
+ },
+ {
+ "name": "current_date",
+ "type": {
+ "type": "int",
+ "logicalType": "date"
+ }
+ },
+ {
+ "name": "current_ts",
+ "type": {
+ "type": "long",
+ "logicalType": "timestamp-micros"
+ }
+ },
+ {
+ "name": "height",
+ "type": {
+ "type": "fixed",
+ "name": "abc",
+ "size": 5,
+ "logicalType": "decimal",
+ "precision": 10,
+ "scale": 6
+ }
+ },
+ {
+ "name": "city_to_state",
+ "type": {
+ "type": "map",
+ "values": "string"
+ }
+ },
+ {
+ "name": "fare",
+ "type": {
+ "type": "record",
+ "name": "fare",
+ "fields": [
+ {
+ "name": "amount",
+ "type": "double"
+ },
+ {
+ "name": "currency",
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "name": "tip_history",
+ "type": {
+ "type": "array",
+ "items": {
+ "type": "record",
+ "name": "tip_history",
+ "fields": [
+ {
+ "name": "amount",
+ "type": "double"
+ },
+ {
+ "name": "currency",
+ "type": "string"
+ }
+ ]
+ }
+ }
+ },
+ {
+ "name": "_hoodie_is_deleted",
+ "type": "boolean",
+ "default": false
+ }
+ ]
+}
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/kafka-source.properties
b/docker/thirdparties/docker-compose/hudi/scripts/config/kafka-source.properties
new file mode 100644
index 0000000000..5ba5290ca6
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hudi/scripts/config/kafka-source.properties
@@ -0,0 +1,30 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+include=base.properties
+# Key fields, for kafka example
+hoodie.datasource.write.recordkey.field=key
+hoodie.datasource.write.partitionpath.field=date
+# Schema provider props (change to absolute path based on your installation)
+hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc
+hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc
+# Kafka Source
+hoodie.deltastreamer.source.kafka.topic=stock_ticks
+#Kafka props
+bootstrap.servers=kafkabroker:9092
+auto.offset.reset=earliest
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/log4j2.properties
b/docker/thirdparties/docker-compose/hudi/scripts/config/log4j2.properties
new file mode 100644
index 0000000000..86450ead3e
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/config/log4j2.properties
@@ -0,0 +1,61 @@
+###
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+###
+status = warn
+name = HudiConsoleLog
+
+# Set everything to be logged to the console
+appender.console.type = Console
+appender.console.name = CONSOLE
+appender.console.layout.type = PatternLayout
+appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Root logger level
+rootLogger.level = warn
+# Root logger referring to console appender
+rootLogger.appenderRef.stdout.ref = CONSOLE
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+logger.apache_spark_repl.name = org.apache.spark.repl.Main
+logger.apache_spark_repl.level = warn
+# Set logging of integration testsuite to INFO level
+logger.hudi_integ.name = org.apache.hudi.integ.testsuite
+logger.hudi_integ.level = info
+# Settings to quiet third party logs that are too verbose
+logger.apache_spark_jetty.name = org.spark_project.jetty
+logger.apache_spark_jetty.level = warn
+logger.apache_spark_jett_lifecycle.name = org.spark_project.jetty.util.component.AbstractLifeCycle
+logger.apache_spark_jett_lifecycle.level = error
+logger.apache_spark_repl_imain.name = org.apache.spark.repl.SparkIMain$exprTyper
+logger.apache_spark_repl_imain.level = info
+logger.apache_spark_repl_iloop.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
+logger.apache_spark_repl_iloop.level = info
+logger.parquet.name = org.apache.parquet
+logger.parquet.level = error
+logger.spark.name = org.apache.spark
+logger.spark.level = warn
+# Disabling Jetty logs
+logger.jetty.name = org.apache.hudi.org.eclipse.jetty
+logger.jetty.level = error
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+logger.hive_handler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
+logger.hive_handler.level = fatal
+logger.hive_func_registry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
+logger.hive_func_registry.level = error
diff --git a/docker/thirdparties/docker-compose/hudi/scripts/config/schema.avsc
b/docker/thirdparties/docker-compose/hudi/scripts/config/schema.avsc
new file mode 100644
index 0000000000..aa8baaf44b
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/config/schema.avsc
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+{
+ "type":"record",
+ "name":"stock_ticks",
+ "fields":[{
+ "name": "volume",
+ "type": "long"
+ }, {
+ "name": "ts",
+ "type": "string"
+ }, {
+ "name": "symbol",
+ "type": "string"
+ },{
+ "name": "year",
+ "type": "int"
+ },{
+ "name": "month",
+ "type": "string"
+ },{
+ "name": "high",
+ "type": "double"
+ },{
+ "name": "low",
+ "type": "double"
+ },{
+ "name": "key",
+ "type": "string"
+ },{
+ "name": "date",
+ "type":"string"
+ }, {
+ "name": "close",
+ "type": "double"
+ }, {
+ "name": "open",
+ "type": "double"
+ }, {
+ "name": "day",
+ "type":"string"
+ }
+]}
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/config/spark-defaults.conf
b/docker/thirdparties/docker-compose/hudi/scripts/config/spark-defaults.conf
new file mode 100644
index 0000000000..d085bfe588
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/config/spark-defaults.conf
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+spark.master local[3]
+spark.eventLog.dir hdfs://namenode:8020/tmp/spark-events
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.kryo.registrator org.apache.spark.HoodieSparkKryoRegistrar
+
+#spark.executor.memory 4g
+# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
diff --git a/docker/thirdparties/docker-compose/hudi/scripts/run_sync_tool.sh
b/docker/thirdparties/docker-compose/hudi/scripts/run_sync_tool.sh
new file mode 100755
index 0000000000..390d09f967
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hudi/scripts/run_sync_tool.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+function error_exit {
+ echo "$1" >&2 ## Send message to stderr. Exclude >&2 if you don't want it that way.
+ exit "${2:-1}" ## Return a code specified by $2 or 1 by default.
+}
+
+if [ -z "${HADOOP_HOME}" ]; then
+ error_exit "Please make sure the environment variable HADOOP_HOME is setup"
+fi
+
+if [ -z "${HIVE_HOME}" ]; then
+ error_exit "Please make sure the environment variable HIVE_HOME is setup"
+fi
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+#Ensure we pick the right jar even for hive11 builds
+HUDI_HIVE_UBER_JAR=`ls -c $DIR/./hudi_docker_compose_attached_file/jar/hoodie-hive-sync-bundle.jar | grep -v source | head -1`
+
+if [ -z "$HADOOP_CONF_DIR" ]; then
+ echo "setting hadoop conf dir"
+ HADOOP_CONF_DIR="${HADOOP_HOME}/etc/hadoop"
+fi
+
+## Include only specific packages from HIVE_HOME/lib to avoid version mismatches
+HIVE_EXEC=`ls ${HIVE_HOME}/lib/hive-exec-*.jar | tr '\n' ':'`
+HIVE_SERVICE=`ls ${HIVE_HOME}/lib/hive-service-*.jar | grep -v rpc | tr '\n' ':'`
+HIVE_METASTORE=`ls ${HIVE_HOME}/lib/hive-metastore-*.jar | tr '\n' ':'`
+HIVE_JDBC=`ls ${HIVE_HOME}/lib/hive-jdbc-*.jar | tr '\n' ':'`
+if [ -z "${HIVE_JDBC}" ]; then
+ HIVE_JDBC=`ls ${HIVE_HOME}/lib/hive-jdbc-*.jar | grep -v handler | tr '\n' ':'`
+fi
+HIVE_JACKSON=`ls ${HIVE_HOME}/lib/jackson-*.jar | tr '\n' ':'`
+HIVE_JARS=$HIVE_METASTORE:$HIVE_SERVICE:$HIVE_EXEC:$HIVE_JDBC:$HIVE_JACKSON
+
+HADOOP_HIVE_JARS=${HIVE_JARS}:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/mapreduce/*:${HADOOP_HOME}/share/hadoop/hdfs/*:${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/hdfs/lib/*
+
+echo "Running Command : java -cp ${HADOOP_HIVE_JARS}:${HADOOP_CONF_DIR}:$HUDI_HIVE_UBER_JAR org.apache.hudi.hive.HiveSyncTool $@"
+java -cp $HUDI_HIVE_UBER_JAR:${HADOOP_HIVE_JARS}:${HADOOP_CONF_DIR} org.apache.hudi.hive.HiveSyncTool "$@"
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_1.sh
b/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_1.sh
new file mode 100755
index 0000000000..a5edb7676a
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_1.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+echo "Copying spark default config and setting up configs"
+cp /var/scripts/config/spark-defaults.conf $SPARK_CONF_DIR/.
+cp /var/scripts/config/log4j2.properties $SPARK_CONF_DIR/.
+echo "sleep 10, wait hdfs start"
+sleep 10
+echo "hadoop fs -mkdir -p /var/demo/"
+hadoop fs -mkdir -p /var/demo/
+echo "hadoop fs -mkdir -p /tmp/spark-events"
+hadoop fs -mkdir -p /tmp/spark-events
+echo "hadoop fs -copyFromLocal -f /var/scripts/config /var/demo/."
+hadoop fs -copyFromLocal -f /var/scripts/config /var/demo/.
+echo "chmod +x /var/scripts/run_sync_tool.sh"
+chmod +x /var/scripts/run_sync_tool.sh
diff --git
a/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_2.sh
b/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_2.sh
new file mode 100755
index 0000000000..a55dddd86d
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hudi/scripts/setup_demo_container_adhoc_2.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+echo "Copying spark default config and setting up configs"
+cp /var/scripts/config/spark-defaults.conf "${SPARK_CONF_DIR}"/.
+cp /var/scripts/config/log4j2.properties "${SPARK_CONF_DIR}"/.
+echo "sleep 10, wait hdfs start"
+sleep 10
+echo "hadoop fs -mkdir -p /var/demo/"
+hadoop fs -mkdir -p /var/demo/
+echo "hadoop fs -mkdir -p /tmp/spark-events"
+hadoop fs -mkdir -p /tmp/spark-events
+echo "hadoop fs -mkdir -p /user/hive/"
+hadoop fs -mkdir -p /user/hive/
+echo "hadoop fs -copyFromLocal -f /var/scripts/config /var/demo/."
+hadoop fs -copyFromLocal -f /var/scripts/config /var/demo/.
+echo "hadoop fs -copyFromLocal -f /var/scripts/hudi_docker_compose_attached_file/warehouse /user/hive/"
+hadoop fs -copyFromLocal -f /var/scripts/hudi_docker_compose_attached_file/warehouse /user/hive/
+echo "chmod +x /var/scripts/run_sync_tool.sh"
+chmod +x /var/scripts/run_sync_tool.sh
+
+echo "Start synchronizing the stock_ticks_cow table"
+/var/scripts/run_sync_tool.sh \
+ --jdbc-url jdbc:hive2://hiveserver:10000 \
+ --user hive \
+ --pass hive \
+ --partitioned-by date \
+ --base-path /user/hive/warehouse/stock_ticks_cow \
+ --database default \
+ --table stock_ticks_cow \
+ --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
+
+echo "Start synchronizing the stock_ticks_mor table"
+/var/scripts/run_sync_tool.sh \
+ --jdbc-url jdbc:hive2://hiveserver:10000 \
+ --user hive \
+ --pass hive \
+ --partitioned-by date \
+ --base-path /user/hive/warehouse/stock_ticks_mor \
+ --database default \
+ --table stock_ticks_mor \
+ --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
+
+echo "Start synchronizing the hudi_cow_pt_tbl table"
+/var/scripts/run_sync_tool.sh \
+ --jdbc-url jdbc:hive2://hiveserver:10000 \
+ --user hive \
+ --pass hive \
+ --partitioned-by dt \
+ --base-path /user/hive/warehouse/hudi_cow_pt_tbl \
+ --database default \
+ --table hudi_cow_pt_tbl \
+ --partition-value-extractor org.apache.hudi.hive.HiveStylePartitionValueExtractor
+
+echo "Start synchronizing the hudi_non_part_cow table"
+/var/scripts/run_sync_tool.sh \
+ --jdbc-url jdbc:hive2://hiveserver:10000 \
+ --user hive \
+ --pass hive \
+ --base-path /user/hive/warehouse/hudi_non_part_cow \
+ --database default \
+ --table hudi_non_part_cow
diff --git a/docker/thirdparties/run-thirdparties-docker.sh
b/docker/thirdparties/run-thirdparties-docker.sh
index 1851b03051..283ed7b35b 100755
--- a/docker/thirdparties/run-thirdparties-docker.sh
+++ b/docker/thirdparties/run-thirdparties-docker.sh
@@ -37,7 +37,7 @@ Usage: $0 <options>
--stop stop the specified components
All valid components:
- mysql,pg,oracle,sqlserver,clickhouse,es,hive,iceberg
+ mysql,pg,oracle,sqlserver,clickhouse,es,hive,iceberg,hudi
"
exit 1
}
@@ -60,7 +60,7 @@ STOP=0
if [[ "$#" == 1 ]]; then
# default
- COMPONENTS="mysql,pg,oracle,sqlserver,clickhouse,hive,iceberg"
+ COMPONENTS="mysql,pg,oracle,sqlserver,clickhouse,hive,iceberg,hudi"
else
while true; do
case "$1" in
@@ -92,7 +92,7 @@ else
done
if [[ "${COMPONENTS}"x == ""x ]]; then
if [[ "${STOP}" -eq 1 ]]; then
- COMPONENTS="mysql,pg,oracle,sqlserver,clickhouse,hive,iceberg"
+ COMPONENTS="mysql,pg,oracle,sqlserver,clickhouse,hive,iceberg,hudi"
fi
fi
fi
@@ -128,6 +128,7 @@ RUN_CLICKHOUSE=0
RUN_HIVE=0
RUN_ES=0
RUN_ICEBERG=0
+RUN_HUDI=0
for element in "${COMPONENTS_ARR[@]}"; do
if [[ "${element}"x == "mysql"x ]]; then
RUN_MYSQL=1
@@ -145,6 +146,8 @@ for element in "${COMPONENTS_ARR[@]}"; do
RUN_HIVE=1
elif [[ "${element}"x == "iceberg"x ]]; then
RUN_ICEBERG=1
+ elif [[ "${element}"x == "hudi"x ]]; then
+ RUN_HUDI=1
else
echo "Invalid component: ${element}"
usage
@@ -265,3 +268,25 @@ if [[ "${RUN_ICEBERG}" -eq 1 ]]; then
sudo docker compose -f "${ROOT}"/docker-compose/iceberg/iceberg.yaml
--env-file "${ROOT}"/docker-compose/iceberg/iceberg.env up -d
fi
fi
+
+if [[ "${RUN_HUDI}" -eq 1 ]]; then
+ # hudi
+ cp "${ROOT}"/docker-compose/hudi/hudi.yaml.tpl
"${ROOT}"/docker-compose/hudi/hudi.yaml
+ sed -i "s/doris--/${CONTAINER_UID}/g"
"${ROOT}"/docker-compose/hudi/hudi.yaml
+ sudo docker compose -f "${ROOT}"/docker-compose/hudi/hudi.yaml --env-file
"${ROOT}"/docker-compose/hudi/hadoop.env down
+ if [[ "${STOP}" -ne 1 ]]; then
+ sudo rm -rf "${ROOT}"/docker-compose/hudi/historyserver
+ sudo mkdir "${ROOT}"/docker-compose/hudi/historyserver
+ sudo rm -rf "${ROOT}"/docker-compose/hudi/hive-metastore-postgresql
+ sudo mkdir "${ROOT}"/docker-compose/hudi/hive-metastore-postgresql
+ if [[ ! -d
"${ROOT}/docker-compose/hudi/scripts/hudi_docker_compose_attached_file" ]]; then
+ echo "Attached files does not exist, please download the
https://doris-build-hk-1308700295.cos.ap-hongkong.myqcloud.com/regression/load/hudi/hudi_docker_compose_attached_file.zip
file to the docker-compose/hudi/scripts/ directory and unzip it."
+ exit 1
+ fi
+ sudo docker compose -f "${ROOT}"/docker-compose/hudi/hudi.yaml
--env-file "${ROOT}"/docker-compose/hudi/hadoop.env up -d
+ echo "sleep 15, wait server start"
+ sleep 15
+ docker exec -it adhoc-1 /bin/bash
/var/scripts/setup_demo_container_adhoc_1.sh
+ docker exec -it adhoc-2 /bin/bash
/var/scripts/setup_demo_container_adhoc_2.sh
+ fi
+fi
diff --git a/docs/zh-CN/community/developer-guide/regression-testing.md
b/docs/zh-CN/community/developer-guide/regression-testing.md
index 48c6de8f7c..3617b4d769 100644
--- a/docs/zh-CN/community/developer-guide/regression-testing.md
+++ b/docs/zh-CN/community/developer-guide/regression-testing.md
@@ -605,10 +605,10 @@ Doris 支持一些外部署数据源的查询。所以回归框架也提供了
1. 启动 Container
- Doris 目前支持 es, mysql, pg, hive, sqlserver, oracle, iceberg 等数据源的 Docker
compose。相关文件存放在 `docker/thirdparties/docker-compose` 目录下。
+ Doris 目前支持 es, mysql, pg, hive, sqlserver, oracle, iceberg, hudi 等数据源的
Docker compose。相关文件存放在 `docker/thirdparties/docker-compose` 目录下。
默认情况下,可以直接通过以下命令启动所有外部数据源的 Docker container:
- (注意,hive container 需要下载预制的数据文件,请参阅下面 hive 相关的文档。)
+ （注意，hive 和 hudi container 需要下载预制的数据文件，请参阅下面 hive 和 hudi 相关的文档。）
```
cd docker/thirdparties && sh run-thirdparties-docker.sh
@@ -692,48 +692,109 @@ Doris 支持一些外部署数据源的查询。所以回归框架也提供了
* `clickhouse.yaml.tpl`:Docker compose 文件模板。无需修改。
* `clickhouse.env`:配置 ClickHouse 对外端口,默认为 8123。
- 8. Iceberg
+ 8. Iceberg
- 提供 Iceberg + Spark + Minio 镜像组合。存放在
docker/thirdparties/docker-compose/iceberg/ 下。
+ 提供 Iceberg + Spark + Minio 镜像组合。存放在
docker/thirdparties/docker-compose/iceberg/ 下。
- * `iceberg.yaml.tpl`:Docker compose 文件模板。无需修改。
- * `entrypoint.sh.tpl`:镜像启动后的初始化脚本模板。无需修改。
- * `spark-defaults.conf.tpl`:Spark 的配置文件模板。无需修改。
- * `iceberg.env`:对外端口配置文件,需修改各个对外端口,避免端口冲突。
+ * `iceberg.yaml.tpl`:Docker compose 文件模板。无需修改。
+ * `entrypoint.sh.tpl`:镜像启动后的初始化脚本模板。无需修改。
+ * `spark-defaults.conf.tpl`:Spark 的配置文件模板。无需修改。
+ * `iceberg.env`:对外端口配置文件,需修改各个对外端口,避免端口冲突。
- 启动后,可以通过如下命令启动 spark-sql
+ 启动后,可以通过如下命令启动 spark-sql
- `docker exec -it doris-xx-spark-iceberg spark-sql`
+ `docker exec -it doris-xx-spark-iceberg spark-sql`
- 其中 `doris-xx-spark-iceberg` 为 container 名称。
+ 其中 `doris-xx-spark-iceberg` 为 container 名称。
- spark-sql iceberg 操作示例:
+ spark-sql iceberg 操作示例:
- ```
- create database db1;
- show databases;
- create table db1.test1(k1 bigint, k2 bigint, k3 string) partitioned by
(k1);
- insert into db1.test1 values(1,2,'abc');
- select * from db1.test1;
- quit;
- ```
+ ```
+ create database db1;
+ show databases;
+ create table db1.test1(k1 bigint, k2 bigint, k3 string) partitioned by
(k1);
+ insert into db1.test1 values(1,2,'abc');
+ select * from db1.test1;
+ quit;
+ ```
- 也可以通过 spark-shell 进行访问:
+ 也可以通过 spark-shell 进行访问:
- ```
- docker exec -it doris-xx-spark-iceberg spark-shell
-
- spark.sql(s"create database db1")
- spark.sql(s"show databases").show()
- spark.sql(s"create table db1.test1(k1 bigint, k2 bigint, k3 string)
partitioned by (k1)").show()
- spark.sql(s"show tables from db1").show()
- spark.sql(s"insert into db1.test1 values(1,2,'abc')").show()
- spark.sql(s"select * from db1.test1").show()
- :q
- ```
+ ```
+ docker exec -it doris-xx-spark-iceberg spark-shell
+
+ spark.sql(s"create database db1")
+ spark.sql(s"show databases").show()
+ spark.sql(s"create table db1.test1(k1 bigint, k2 bigint, k3 string)
partitioned by (k1)").show()
+ spark.sql(s"show tables from db1").show()
+ spark.sql(s"insert into db1.test1 values(1,2,'abc')").show()
+ spark.sql(s"select * from db1.test1").show()
+ :q
+ ```
+
+ 更多使用方式可参阅 [Tabular
官方文档](https://tabular.io/blog/docker-spark-and-iceberg/)。
+ 9. Hudi
- 更多使用方式可参阅 [Tabular
官方文档](https://tabular.io/blog/docker-spark-and-iceberg/)。
+ Hudi 相关的 Docker compose 文件存放在 docker/thirdparties/docker-compose/hudi 下。
+ * `hudi.yaml.tpl`:Docker compose 文件模板,无需修改。
+ * `hadoop.env`:配置文件的模板,无需修改。
+ * `scripts/` 目录会在 container 启动后挂载到 container 中。其中的文件内容无需修改。但须注意,在启动
container 之前,需要先下载预制文件:
+ 将
`https://doris-build-hk-1308700295.cos.ap-hongkong.myqcloud.com/regression/load/hudi/hudi_docker_compose_attached_file.zip`
文件下载到 `scripts/` 目录并解压即可。
+
+ *
+ 启动前,可以将以下设置添加到`/etc/hosts`中,以避免出现`UnknownHostException`错误
+ ```
+ 127.0.0.1 adhoc-1
+ 127.0.0.1 adhoc-2
+ 127.0.0.1 namenode
+ 127.0.0.1 datanode1
+ 127.0.0.1 hiveserver
+ 127.0.0.1 hivemetastore
+ 127.0.0.1 sparkmaster
+ ```
+
+ 启动后,可以通过如下命令启动 hive query
+
+ ```
+ docker exec -it adhoc-2 /bin/bash
+
+ beeline -u jdbc:hive2://hiveserver:10000 \
+ --hiveconf
hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
+ --hiveconf hive.stats.autogather=false
+
+ show tables;
+ show partitions stock_ticks_mor_rt;
+ select symbol, max(ts) from stock_ticks_cow group by symbol HAVING
symbol = 'GOOG';
+ select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING
symbol = 'GOOG';
+ exit;
+ ```
+
+ 也可以通过 spark-shell 进行访问:
+
+ ```
+ docker exec -it adhoc-1 /bin/bash
+
+ $SPARK_INSTALL/bin/spark-shell \
+ --jars
/var/scripts/hudi_docker_compose_attached_file/jar/hoodie-hive-sync-bundle.jar \
+ --master local[2] \
+ --driver-class-path $HADOOP_CONF_DIR \
+ --conf spark.sql.hive.convertMetastoreParquet=false \
+ --deploy-mode client \
+ --driver-memory 1G \
+ --executor-memory 3G \
+ --num-executors 1
+
+ spark.sql("show tables").show(100, false)
+ spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol
HAVING symbol = 'GOOG'").show(100, false)
+ spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close
from stock_ticks_cow where symbol = 'GOOG'").show(100, false)
+ spark.sql("select symbol, max(ts) from stock_ticks_mor_ro group by
symbol HAVING symbol = 'GOOG'").show(100, false)
+ spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by
symbol HAVING symbol = 'GOOG'").show(100, false)
+ spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close
from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false)
+ :q
+ ```
+
+ 更多使用方式可参阅 [Hudi 官方文档](https://hudi.apache.org/docs/docker_demo)。
2. 运行回归测试
外表相关的回归测试默认是关闭的,可以修改 `regression-test/conf/regression-conf.groovy`
中的以下配置来开启:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]