Hello everyone,
I am trying to run a minimal example in my k8s cluster.
First, I cloned the petastorm github repo: https://github.com/uber/petastorm
Second, I created a Docker image as follows:
FROM ubuntu:20.04

# Build-time only: suppress interactive apt prompts (ARG, not ENV, so it is
# not baked into the runtime environment).
ARG DEBIAN_FRONTEND=noninteractive

# Add the deadsnakes PPA (source of python3.9 on 20.04) and install all
# build/runtime packages. update + install + cleanup live in ONE layer so a
# stale apt cache can never be reused and the lists never bloat the image.
RUN apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        openjdk-8-jre-headless \
        python \
        python3-pip \
        python3.9 \
        python3.9-dev \
        python3.9-venv \
        virtualenv \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Fetch the example MNIST dataset (single layer, single command).
RUN wget -q https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2 -P /data/mnist/

# COPY is preferred over ADD for plain local files; COPY creates missing
# target directories, so the explicit `mkdir /petastorm` is unnecessary.
COPY setup.py README.rst /petastorm/
COPY petastorm /petastorm/petastorm

# Create the virtualenv and install petastorm plus pinned runtime deps.
# --no-cache/--no-cache-dir keep pip's download cache out of the layers.
RUN python3.9 -m pip install --no-cache-dir --upgrade pip wheel \
    && python3.9 -m venv /petastorm_venv3.9 \
    && /petastorm_venv3.9/bin/pip3.9 install --no-cache scikit-build \
    && /petastorm_venv3.9/bin/pip3.9 install --no-cache -e /petastorm/[test,tf,torch,docs,opencv] \
        --only-binary pyarrow --only-binary opencv-python \
    && /petastorm_venv3.9/bin/pip3.9 install --no-cache -U pyarrow==3.0.0 numpy==1.19.3 tensorflow==2.5.0 pyspark==3.0.0 \
    && /petastorm_venv3.9/bin/pip3.9 install --no-cache opencv-python-headless

# NOTE(review): the original ran `rm -r /petastorm` here, but petastorm was
# installed with `-e` (editable), which leaves the venv pointing back at
# /petastorm — deleting the source tree breaks `import petastorm`. The
# removal is dropped; if the tree must go, install without -e first.
COPY docker/run_in_venv.sh /
Afterwards, I create a namespace called spark in my k8s cluster, a
ServiceAccount (spark-driver), and a RoleBinding for the service account,
as follows:
kubectl create ns spark
# The ServiceAccount and RoleBinding must live in the spark namespace;
# without -n they are created in the current context's default namespace,
# and Spark's reference "spark:spark-driver" would not resolve.
kubectl create serviceaccount spark-driver -n spark
kubectl create rolebinding spark-driver-rb \
    --clusterrole=cluster-admin \
    --serviceaccount=spark:spark-driver \
    -n spark
Finally I create a pod in the spark namespace as follows:
# Driver pod: stays alive via a sleep loop so we can exec in and run Spark.
apiVersion: v1
kind: Pod
metadata:
  name: "petastorm-ds-creator"
  namespace: spark
  labels:
    app: "petastorm-ds-creator"
spec:
  # `serviceAccountName` is the current field; `serviceAccount` is a
  # deprecated alias.
  serviceAccountName: spark-driver
  containers:
    - name: petastorm-ds-creator
      image: "imagename"
      command:
        - "/bin/bash"
        - "-c"
        - "--"
      args:
        - "while true; do sleep 30; done;"
      resources:
        limits:
          cpu: 2000m
          memory: 5000Mi
        requests:
          cpu: 2000m
          memory: 5000Mi
      ports:
        - containerPort: 80
          name: http
        - containerPort: 443
          name: https
        # Spark driver port — must match spark.driver.port and the headless
        # service that exposes this pod.
        - containerPort: 20022
          name: exposed
      volumeMounts:
        - name: data
          mountPath: /data
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: spark-geodata-nfs-pvc-20220503
  restartPolicy: Always
I expose port 20022 of the pod with a headless service
# Headless service (clusterIP: None) so executor pods can resolve the driver
# by the name configured in spark.driver.host.
kubectl expose pod petastorm-ds-creator \
    --port=20022 \
    --type=ClusterIP \
    --cluster-ip=None \
    -n spark
Finally, I run the following code in the created container/pod:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Image used by Spark to launch EXECUTOR pods. It MUST be an image that
# contains the Spark distribution's container entrypoint (built with
# $SPARK_HOME/bin/docker-image-tool.sh, or an official Spark image matching
# pyspark 3.0.0). Executor pods are started with the argument "executor",
# which only that entrypoint understands — reusing the plain application
# image is what produces:
#   exec: "executor": executable file not found in $PATH
SPARK_CONTAINER_IMAGE = "imagename"  # TODO: replace with a Spark-built image

# Spark-on-Kubernetes configuration: the driver runs in this pod; executors
# are launched by Spark as separate pods in the "spark" namespace.
spark_conf = SparkConf()
spark_conf.setMaster("k8s://https://kubernetes.default:443")
spark_conf.setAppName("PetastormDsCreator")
spark_conf.set("spark.kubernetes.namespace", "spark")
spark_conf.set(
    "spark.kubernetes.authenticate.driver.serviceAccountName",
    "spark-driver",
)
# In-cluster credentials mounted into every pod by Kubernetes.
spark_conf.set(
    "spark.kubernetes.authenticate.caCertFile",
    "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt",
)
spark_conf.set(
    "spark.kubernetes.authenticate.oauthTokenFile",
    "/var/run/secrets/kubernetes.io/serviceaccount/token",
)
spark_conf.set("spark.executor.instances", "2")
# Must match the headless service name and the exposed container port so
# executors can connect back to the driver.
spark_conf.set("spark.driver.host", "petastorm-ds-creator")
spark_conf.set("spark.driver.port", "20022")
spark_conf.set("spark.kubernetes.container.image", SPARK_CONTAINER_IMAGE)

spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext

# Smoke test: approximate sum of 0..9 with a 3 ms timeout.
t = sc.parallelize(range(10))
r = t.sumApprox(3)
print('Approximate sum: %s' % r)
Unfortunately, it does not work.
With `kubectl describe po <podname>-exec-1` I get the following error message:
Error: failed to start container "spark-kubernetes-executor": Error
response from daemon: OCI runtime create failed: container_linux.go:349:
starting container process caused "exec: \"executor\": executable file
not found in $PATH": unknown
Could somebody give me a hint as to what I am doing wrong? Is my
SparkSession configuration incorrect?
Best regards
Andreas