soumilshah1995 commented on issue #8400: URL: https://github.com/apache/hudi/issues/8400#issuecomment-1636454180
@AmareshB Sure @AmareshB # Step 1 : Create EMR 6.11 Cluster ![image](https://github.com/apache/hudi/assets/39345855/320ac005-344f-4da6-a02c-1cdad5462226) # Step2 : Create MOR table ``` try: import sys import os from pyspark.context import SparkContext from pyspark.sql.session import SparkSession from awsglue.context import GlueContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when from pyspark.sql.functions import * from awsglue.utils import getResolvedOptions from pyspark.sql.types import * from datetime import datetime, date import boto3 from functools import reduce from pyspark.sql import Row import uuid from faker import Faker except Exception as e: print("Modules are missing : {} ".format(e)) spark = (SparkSession.builder.config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer') \ .config('spark.sql.hive.convertMetastoreParquet', 'false') \ .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.hudi.catalog.HoodieCatalog') \ .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension') \ .config('spark.sql.legacy.pathOptionBehavior.enabled', 'true').getOrCreate()) sc = spark.sparkContext glueContext = GlueContext(sc) job = Job(glueContext) logger = glueContext.get_logger() # =================================INSERTING DATA ===================================== global faker faker = Faker() class DataGenerator(object): @staticmethod def get_data(): return [ ( x, faker.name(), faker.random_element(elements=('IT', 'HR', 'Sales', 'Marketing')), faker.random_element(elements=('CA', 'NY', 'TX', 'FL', 'IL', 'RJ')), str(faker.random_int(min=10000, max=150000)), str(faker.random_int(min=18, max=60)), str(faker.random_int(min=0, max=100000)), str(faker.unix_time()), faker.email(), faker.credit_card_number(card_type='amex'), ) for x in range(5) ] # ============================== Settings ======================================= db_name = "hudidb" table_name = "employees" recordkey = 'emp_id' precombine = "ts" PARTITION_FIELD = 'state' path = "s3://soumilshah-hudi-demos/hudi/" method = 'upsert' table_type = "MERGE_ON_READ" # ==================================================================================== hudi_part_write_config = { 'className': 'org.apache.hudi', 'hoodie.table.name': table_name, 'hoodie.datasource.write.table.type': table_type, 'hoodie.datasource.write.operation': method, 'hoodie.datasource.write.recordkey.field': recordkey, 'hoodie.datasource.write.precombine.field': precombine, "hoodie.schema.on.read.enable": "true", "hoodie.datasource.write.reconcile.schema": "true", 'hoodie.datasource.hive_sync.mode': 'hms', 'hoodie.datasource.hive_sync.enable': 'true', 'hoodie.datasource.hive_sync.use_jdbc': 'false', 'hoodie.datasource.hive_sync.support_timestamp': 'false', 'hoodie.datasource.hive_sync.database': db_name, 'hoodie.datasource.hive_sync.table': table_name , "hoodie.compact.inline": "false" , 'hoodie.compact.schedule.inline': 'true' , "hoodie.metadata.index.check.timeout.seconds": "60" , "hoodie.write.concurrency.mode": "optimistic_concurrency_control" , "hoodie.write.lock.provider": "org.apache.hudi.client.transaction.lock.InProcessLockProvider" } # ==================================================== """Create Spark Data Frame """ # ==================================================== data = DataGenerator.get_data() columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"] df = spark.createDataFrame(data=data, schema=columns) df.write.format("hudi").options(**hudi_part_write_config).mode("overwrite").save(path) # ==================================================== """APPEND """ # ==================================================== impleDataUpd = [ (6, "This is APPEND", "Sales", "RJ", 81000, 30, 23000, 827307999), (7, "This is APPEND", "Engineering", "RJ", 79000, 53, 15000, 1627694678), ] columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"] usr_up_df = spark.createDataFrame(data=impleDataUpd, schema=columns) usr_up_df.write.format("hudi").options(**hudi_part_write_config).mode("append").save(path) # ==================================================== """UPDATE """ # ==================================================== impleDataUpd = [ (3, "this is update 1** on data lake", "Sales", "RJ", 81000, 30, 23000, 827307999), ] columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts"] usr_up_df = spark.createDataFrame(data=impleDataUpd, schema=columns) usr_up_df.write.format("hudi").options(**hudi_part_write_config).mode("append").save(path) ``` # Step 3: Fire job ``` """ https://github.com/apache/hudi/issues/8400 """ try: import json import uuid import os import boto3 from dotenv import load_dotenv load_dotenv("../.env") except Exception as e: pass global AWS_ACCESS_KEY global AWS_SECRET_KEY global AWS_REGION_NAME AWS_ACCESS_KEY = os.getenv("DEV_ACCESS_KEY") AWS_SECRET_KEY = os.getenv("DEV_SECRET_KEY") AWS_REGION_NAME = "us-east-1" client = boto3.client("emr-serverless", aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION_NAME) def lambda_handler_test_emr(event, context): # ============================== Settings ======================================= table_name = "employees" recordkey = 'emp_id' precombine = "ts" path = "s3://soumilshah-hudi-demos/hudi/" # ==================================================================================== # --------------------------------------------------------------------------------- # EMR # -------------------------------------------------------------------------------- ApplicationId = os.getenv("ApplicationId") ExecutionTime = 600 ExecutionArn = os.getenv("ExecutionArn") JobName = 'hudi_compaction_{}'.format(table_name) jar_path = "s3://soumilshah-hudi-demos/jar/hudi-spark3.3-bundle_2.12-0.13.0.jar" # -------------------------------------------------------------------------------- spark_submit_parameters = ' --conf spark.serializer=org.apache.spark.serializer.KryoSerializer' spark_submit_parameters += ' --class org.apache.hudi.utilities.HoodieCompactor' # schedule | execute | scheduleAndExecute arguments = [ '--spark-memory', '5g', '--parallelism', '2', "--mode", "schedule", "--base-path", path, "--table-name", table_name, "--hoodie-conf", "hoodie.datasource.write.recordkey.field={}".format(recordkey), "--hoodie-conf", "hoodie.datasource.write.precombine.field={}".format(precombine), "--hoodie-conf", "hoodie.compact.schedule.inline=true", "--hoodie-conf", "hoodie.compact.inline.max.delta.commits=1" ] response = client.start_job_run( applicationId=ApplicationId, clientToken=uuid.uuid4().__str__(), executionRoleArn=ExecutionArn, jobDriver={ 'sparkSubmit': { 'entryPoint': "command-runner.jar", 'entryPointArguments': arguments, 'sparkSubmitParameters': spark_submit_parameters }, }, executionTimeoutMinutes=ExecutionTime, name=JobName, ) print("response", end="\n") print(response) lambda_handler_test_emr(context=None, event=None) ``` ![image](https://github.com/apache/hudi/assets/39345855/15f43e91-fc58-4b7e-8587-5615c55db6b5) # Now again trying with custom jar ``` """ https://github.com/apache/hudi/issues/8400 """ try: import json import uuid import os import boto3 from dotenv import load_dotenv load_dotenv("../.env") except Exception as e: pass global AWS_ACCESS_KEY global AWS_SECRET_KEY global AWS_REGION_NAME AWS_ACCESS_KEY = os.getenv("DEV_ACCESS_KEY") AWS_SECRET_KEY = os.getenv("DEV_SECRET_KEY") AWS_REGION_NAME = "us-east-1" client = boto3.client("emr-serverless", aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION_NAME) def lambda_handler_test_emr(event, context): # ============================== Settings ======================================= table_name = "employees" recordkey = 'emp_id' precombine = "ts" path = "s3://soumilshah-hudi-demos/hudi/" # ==================================================================================== # --------------------------------------------------------------------------------- # EMR # -------------------------------------------------------------------------------- ApplicationId = os.getenv("ApplicationId") ExecutionTime = 600 ExecutionArn = os.getenv("ExecutionArn") JobName = 'hudi_compaction_{}'.format(table_name) jar_path = "s3://soumilshah-hudi-demos/jar/hudi-spark3.3-bundle_2.12-0.13.0.jar" # -------------------------------------------------------------------------------- spark_submit_parameters = ' --conf spark.serializer=org.apache.spark.serializer.KryoSerializer' spark_submit_parameters += ' --class org.apache.hudi.utilities.HoodieCompactor' # schedule | execute | scheduleAndExecute arguments = [ '--spark-memory', '5g', '--parallelism', '2', "--mode", "schedule", "--base-path", path, "--table-name", table_name, "--hoodie-conf", "hoodie.datasource.write.recordkey.field={}".format(recordkey), "--hoodie-conf", "hoodie.datasource.write.precombine.field={}".format(precombine), "--hoodie-conf", "hoodie.compact.schedule.inline=true", "--hoodie-conf", "hoodie.compact.inline.max.delta.commits=1" ] # arguments = [ # '--spark-memory', '1g', # '--parallelism', '2', # "--mode", "schedule", # "--base-path", path, # "--table-name", table_name # ] response = client.start_job_run( applicationId=ApplicationId, clientToken=uuid.uuid4().__str__(), executionRoleArn=ExecutionArn, jobDriver={ 'sparkSubmit': { 'entryPoint': jar_path, 'entryPointArguments': arguments, 'sparkSubmitParameters': spark_submit_parameters }, }, executionTimeoutMinutes=ExecutionTime, name=JobName, ) print("response", end="\n") print(response) lambda_handler_test_emr(context=None, event=None) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org