This is an automated email from the ASF dual-hosted git repository.
janardhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new eb29b2d548 [SYSTEMDS-2926] AWS scripts update for EMR-7.0.0 (#2003)
eb29b2d548 is described below
commit eb29b2d5488d503c97c6b5845a7fffee9a225eed
Author: Lachezar Nikolov <[email protected]>
AuthorDate: Sun Feb 25 12:57:18 2024 +0100
[SYSTEMDS-2926] AWS scripts update for EMR-7.0.0 (#2003)
The changes fix some general issues:
- creating and referencing the S3 buckets
- not assigning any subnet for the cluster (bad practice and a potential
security vulnerability)
The changes also update the EMR release to the currently most recent
version, emr-7.0.0:
- configuration updates
- replacing Ganglia with AmazonCloudWatchAgent, see
https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-AmazonCloudWatchAgent.html
While testing the scripts against the current repository version, the
following bug was observed: launching SystemDS in execution mode "spark"
throws an `IllegalCallerException`.
Running the command `spark-submit target/SystemDS.jar -f path/to/hello.dml
-exec spark -stats -explain` yields the following console output:
```shell
...
--MAIN PROGRAM
----GENERIC (lines 1-1) [recompile=false]
------CP print Hello World.SCALAR.STRING.true _Var0.SCALAR.STRING 8
------CP rmvar _Var0
An Error Occurred :
IllegalCallerException -- java.lang.ref is not open to unnamed module @4eba373c
```
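EMR 7.0.0 runs Spark on Java 17 (Amazon Corretto), whose module system
denies the reflective access SystemDS attempts on `java.lang.ref`. A
possible workaround, not part of this commit and sketched here only as an
assumption, is to open that package to the unnamed module via the standard
`--add-opens` JVM flag on both driver and executors:
```shell
# Hypothetical workaround (not in this commit): open java.lang.ref to the
# unnamed module under the Java 17+ module system, on driver and executors.
spark-submit \
  --driver-java-options "--add-opens=java.base/java.lang.ref=ALL-UNNAMED" \
  --conf "spark.executor.extraJavaOptions=--add-opens=java.base/java.lang.ref=ALL-UNNAMED" \
  target/SystemDS.jar -f path/to/hello.dml -exec spark -stats -explain
```
Whether this single open is sufficient depends on which packages SystemDS
reflects into; additional `--add-opens` entries may be needed.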
---
scripts/aws/run_systemds_script.sh | 4 ++--
scripts/aws/spinup_systemds_cluster.sh | 21 +++++++++++++++------
scripts/aws/systemds_cluster.config | 4 ++--
3 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/scripts/aws/run_systemds_script.sh b/scripts/aws/run_systemds_script.sh
index 4c58fcec9b..db2d7185e2 100755
--- a/scripts/aws/run_systemds_script.sh
+++ b/scripts/aws/run_systemds_script.sh
@@ -32,7 +32,7 @@ fi
source systemds_cluster.config
-aws s3 cp $1 s3://system-ds-bucket/ --exclude "*" --include "*.dml"
+aws s3 cp $1 s3://${BUCKET} --exclude "*" --include "*.dml"
if [ ! -z "$2" ]
then
@@ -50,7 +50,7 @@ STEP_INFO=$(aws emr add-steps --cluster-id $CLUSTER_ID
--steps "Type=Spark,
--driver-memory,$SPARK_DRIVER_MEMORY,
--num-executors,$SPARK_NUM_EXECUTORS,
--conf,spark.driver.maxResultSize=0,
- $SYSTEMDS_JAR_PATH, -f, s3://system-ds-bucket/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]")
+ $SYSTEMDS_JAR_PATH, -f, s3://$BUCKET/$dml_filename, -exec, $SYSTEMDS_EXEC_MODE,$args,-stats, -explain]")
STEP_ID=$(echo $STEP_INFO | jq .StepIds | tr -d '"' | tr -d ']' | tr -d '[' | tr -d '[:space:]' )
echo "Waiting for the step to finish"
diff --git a/scripts/aws/spinup_systemds_cluster.sh b/scripts/aws/spinup_systemds_cluster.sh
index 8c93a636ea..58f9f2db05 100755
--- a/scripts/aws/spinup_systemds_cluster.sh
+++ b/scripts/aws/spinup_systemds_cluster.sh
@@ -46,9 +46,12 @@ set_config "SPARK_EXECUTOR_MEMORY" $SPARK_EXECUTOR_MEMORY
set_config "SPARK_DRIVER_MEMORY" "1G"
set_config "BUCKET" $BUCKET-$(((RANDOM % 999) + 1000))
-#Create systemDS bucket
-aws s3api create-bucket --bucket $BUCKET --region $REGION &> /dev/null
-aws s3api create-bucket --bucket $BUCKET-logs --region $REGION &> /dev/null
+#Source again to update the changes for the current session
+source systemds_cluster.config
+
+#Create systemDS bucket (LocationConstraint configuration required for regions outside of us-east-1)
+aws s3api create-bucket --bucket $BUCKET --region $REGION --create-bucket-configuration LocationConstraint=$REGION &> /dev/null
+aws s3api create-bucket --bucket $BUCKET-logs --region $REGION --create-bucket-configuration LocationConstraint=$REGION &> /dev/null
# Upload Jar and scripts to s3
aws s3 sync $SYSTEMDS_TARGET_DIRECTORY s3://$BUCKET --exclude "*" --include "*.dml" --include "*config.xml" --include "*DS.jar*"
@@ -60,11 +63,17 @@ if [ ! -f ${KEYPAIR_NAME}.pem ]; then
echo "${KEYPAIR_NAME}.pem private key created!"
fi
+#Get the first available subnet in the default VPC of the configured region
+DEFAULT_SUBNET=$(aws ec2 describe-subnets --region $REGION \
+ --filter "Name=defaultForAz,Values=true" --query "Subnets[0].SubnetId" --output text)
+
#Create the cluster
+#Note: Ganglia is not available since emr-6.15.0; replaced with AmazonCloudWatchAgent
CLUSTER_INFO=$(aws emr create-cluster \
- --applications Name=Ganglia Name=Spark \
+ --applications Name=AmazonCloudWatchAgent Name=Spark \
--ec2-attributes '{"KeyName":"'${KEYPAIR_NAME}'",
- "InstanceProfile":"EMR_EC2_DefaultRole"}'\
+ "InstanceProfile":"EMR_EC2_DefaultRole",
+ "SubnetId": "'${DEFAULT_SUBNET}'"}'\
--service-role EMR_DefaultRole \
--enable-debugging \
--release-label $EMR_VERSION \
@@ -104,6 +113,6 @@ echo "Cluster info:"
export CLUSTER_URL=$(aws emr describe-cluster --cluster-id $CLUSTER_ID | jq .Cluster.MasterPublicDnsName | tr -d '"')
aws emr ssh --cluster-id $CLUSTER_ID --key-pair-file ${KEYPAIR_NAME}.pem --region $REGION \
- --command 'aws s3 cp s3://system-ds-bucket/target . --recursive --exclude "*" --include "*DS.jar*"'
+ --command 'aws s3 cp s3://'${BUCKET}' . --recursive --exclude "*" --include "*DS.jar*"'
echo "Spinup finished."
diff --git a/scripts/aws/systemds_cluster.config b/scripts/aws/systemds_cluster.config
index a254bbc864..8afed8d2bb 100644
--- a/scripts/aws/systemds_cluster.config
+++ b/scripts/aws/systemds_cluster.config
@@ -23,8 +23,8 @@
KEYPAIR_NAME="SystemDSkeynamex"
REGION="us-east-1"
-BUCKET="systemds-bucket"
-EMR_VERSION="emr-5.28.0"
+BUCKET=systemds-bucket
+EMR_VERSION="emr-7.0.0"
INSTANCES_TYPE="m5.xlarge"
MASTER_INSTANCES_COUNT=1
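For reference, a minimal usage sketch of the updated scripts, assuming an
AWS CLI configured with credentials and a default region (the DML file
path is illustrative):
```shell
cd scripts/aws
# Creates the S3 buckets, uploads the SystemDS jar, and starts the EMR cluster
./spinup_systemds_cluster.sh
# Uploads the DML script to the bucket and submits it as an EMR step
./run_systemds_script.sh path/to/hello.dml
```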