This is an automated email from the ASF dual-hosted git repository.
janardhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new eb29b2d548 [SYSTEMDS-2926] AWS scripts update for EMR-7.0.0 (#2003)
eb29b2d548 is described below
commit eb29b2d5488d503c97c6b5845a7fffee9a225eed
Author: Lachezar Nikolov <[email protected]>
AuthorDate: Sun Feb 25 12:57:18 2024 +0100
[SYSTEMDS-2926] AWS scripts update for EMR-7.0.0 (#2003)
The changes fix some general issues:
- creating and referencing the S3 buckets
- not assigning any subnet for the cluster (bad practice and a potential
security vulnerability)
The changes also update the EMR release to the currently most recent
version, emr-7.0.0:
- configuration updates
- replacing Ganglia with AmazonCloudWatchAgent, see
https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-AmazonCloudWatchAgent.html
While testing the scripts against the current repository version, the
following bug was observed: launching SystemDS in execution mode "spark"
throws an `IllegalCallerException`.
Running the command `spark-submit target/SystemDS.jar -f path/to/hello.dml
-exec spark -stats -explain` yields the following console output:
```shell
...
--MAIN PROGRAM
----GENERIC (lines 1-1) [recompile=false]
------CP print Hello World.SCALAR.STRING.true _Var0.SCALAR.STRING 8
------CP rmvar _Var0
An Error Occurred :
IllegalCallerException -- java.lang.ref is not open to unnamed module @4eba373c
```
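EMR 7.0.0 runs Spark on Java 17 (Amazon Corretto), whose module system
denies the reflective access SystemDS attempts on `java.lang.ref`. A
possible workaround, not part of this commit and sketched here only as an
assumption, is to open that package to the unnamed module via the standard
`--add-opens` JVM flag on both driver and executors:
```shell
# Hypothetical workaround (not in this commit): open java.lang.ref to the
# unnamed module under the Java 17+ module system, on driver and executors.
spark-submit \
  --driver-java-options "--add-opens=java.base/java.lang.ref=ALL-UNNAMED" \
  --conf "spark.executor.extraJavaOptions=--add-opens=java.base/java.lang.ref=ALL-UNNAMED" \
  target/SystemDS.jar -f path/to/hello.dml -exec spark -stats -explain
```
Whether this single open is sufficient depends on which packages SystemDS
reflects into; additional `--add-opens` entries may be needed.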
---
scripts/aws/run_systemds_script.sh | 4 ++--
scripts/aws/spinup_systemds_cluster.sh | 21 +++++++++++++++------
scripts/aws/systemds_cluster.config | 4 ++--
3 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/scripts/aws/run_systemds_script.sh b/scripts/aws/run_systemds_script.sh
index 4c58fcec9b..db2d7185e2 100755
--- a/scripts/aws/run_systemds_script.sh
+++ b/scripts/aws/run_systemds_script.sh
@@ -32,7 +32,7 @@ fi
source systemds_cluster.config
-aws s3 cp $1 s3://system-ds-bucket/ --exclude "*" --include "*.dml"
+aws s3 cp $1 s3://${BUCKET} --exclude "*" --include "*.dml"
if [ ! -z "$2" ]
then
@@ -50,7 +50,7 @@ STEP_INFO=$(aws emr add-steps --cluster-id $CLUSTER_ID
--steps "Type=Spark,
--driver-memory,$SPARK_DRIVER_MEMORY,
--num-executors,$SPARK_NUM_EXECUTORS,
--conf,spark.driver.maxResultSize=0,
- $SYSTEMDS_JAR_PATH, -f, s3://system-ds-bucket/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]")
+ $SYSTEMDS_JAR_PATH, -f, s3://$BUCKET/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]")
STEP_ID=$(echo $STEP_INFO | jq .StepIds | tr -d '"' | tr -d ']' | tr -d '[' | tr -d '[:space:]' )
echo "Waiting for the step to finish"
diff --git a/scripts/aws/spinup_systemds_cluster.sh b/scripts/aws/spinup_systemds_cluster.sh
index 8c93a636ea..58f9f2db05 100755
--- a/scripts/aws/spinup_systemds_cluster.sh
+++ b/scripts/aws/spinup_systemds_cluster.sh
@@ -46,9 +46,12 @@ set_config "SPARK_EXECUTOR_MEMORY" $SPARK_EXECUTOR_MEMORY
set_config "SPARK_DRIVER_MEMORY" "1G"
set_config "BUCKET" $BUCKET-$(((RANDOM % 999) + 1000))
-#Create systemDS bucket
-aws s3api create-bucket --bucket $BUCKET --region $REGION &> /dev/null
-aws s3api create-bucket --bucket $BUCKET-logs --region $REGION &> /dev/null
+#Source again to update the changes for the current session
+source systemds_cluster.config
+
+#Create systemDS bucket (LocationConstraint configuration required for regions outside of us-east-1)
+aws s3api create-bucket --bucket $BUCKET --region $REGION --create-bucket-configuration LocationConstraint=$REGION &> /dev/null
+aws s3api create-bucket --bucket $BUCKET-logs --region $REGION --create-bucket-configuration LocationConstraint=$REGION &> /dev/null
# Upload Jar and scripts to s3
aws s3 sync $SYSTEMDS_TARGET_DIRECTORY s3://$BUCKET --exclude "*" --include "*.dml" --include "*config.xml" --include "*DS.jar*"
@@ -60,11 +63,17 @@ if [ ! -f ${KEYPAIR_NAME}.pem ]; then
echo "${KEYPAIR_NAME}.pem private key created!"
fi
+#Get the first available subnet in the default VPC of the configured region
+DEFAULT_SUBNET=$(aws ec2 describe-subnets --region $REGION \
+ --filter "Name=defaultForAz,Values=true" --query "Subnets[0].SubnetId" --output text)
+
#Create the cluster
+#Note: Ganglia is not available since emr-6.15.0; replaced with AmazonCloudWatchAgent
CLUSTER_INFO=$(aws emr create-cluster \
- --applications Name=Ganglia Name=Spark \
+ --applications Name=AmazonCloudWatchAgent Name=Spark \
--ec2-attributes '{"KeyName":"'${KEYPAIR_NAME}'",
- "InstanceProfile":"EMR_EC2_DefaultRole"}'\
+ "InstanceProfile":"EMR_EC2_DefaultRole",
+ "SubnetId": "'${DEFAULT_SUBNET}'"}'\
--service-role EMR_DefaultRole \
--enable-debugging \
--release-label $EMR_VERSION \
@@ -104,6 +113,6 @@ echo "Cluster info:"
export CLUSTER_URL=$(aws emr describe-cluster --cluster-id $CLUSTER_ID | jq .Cluster.MasterPublicDnsName | tr -d '"')
aws emr ssh --cluster-id $CLUSTER_ID --key-pair-file ${KEYPAIR_NAME}.pem --region $REGION \
- --command 'aws s3 cp s3://system-ds-bucket/target . --recursive --exclude "*" --include "*DS.jar*"'
+ --command 'aws s3 cp s3://'${BUCKET}' . --recursive --exclude "*" --include "*DS.jar*"'
echo "Spinup finished."
diff --git a/scripts/aws/systemds_cluster.config b/scripts/aws/systemds_cluster.config
index a254bbc864..8afed8d2bb 100644
--- a/scripts/aws/systemds_cluster.config
+++ b/scripts/aws/systemds_cluster.config
@@ -23,8 +23,8 @@
KEYPAIR_NAME="SystemDSkeynamex"
REGION="us-east-1"
-BUCKET="systemds-bucket"
-EMR_VERSION="emr-5.28.0"
+BUCKET=systemds-bucket
+EMR_VERSION="emr-7.0.0"
INSTANCES_TYPE="m5.xlarge"
MASTER_INSTANCES_COUNT=1
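For reference, a minimal usage sketch of the updated scripts, assuming an
AWS CLI configured with credentials and a default region (the DML file
path is illustrative):
```shell
cd scripts/aws
# Creates the S3 buckets, uploads the SystemDS jar, and starts the EMR cluster
./spinup_systemds_cluster.sh
# Uploads the DML script to the bucket and submits it as an EMR step
./run_systemds_script.sh path/to/hello.dml
```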