This guide explains how to run benchmarks derived from TPC-H and TPC-DS with Apache DataFusion Comet deployed in a local MicroK8s cluster.
Install MicroK8s following the instructions at https://microk8s.io/docs/getting-started, then perform the following additional steps. Note that the second command overwrites `~/.kube/config`, so back up any existing kube config first.
```shell
mkdir -p ~/.kube
microk8s config > ~/.kube/config
microk8s enable dns
microk8s enable registry
microk8s kubectl create serviceaccount spark
```
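Before building images, it can be worth verifying that the cluster and the new service account are in place. A minimal check, assuming `~/.kube/config` now points at the MicroK8s cluster:

```shell
# Block until MicroK8s (including the dns and registry add-ons) reports ready
microk8s status --wait-ready

# The node should be Ready and the spark service account should exist
microk8s kubectl get nodes
microk8s kubectl get serviceaccount spark
```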
Run the following command from the root of this repository to build the Comet Docker image, or use a published image from https://github.com/orgs/apache/packages?repo_name=datafusion-comet.
```shell
docker build -t apache/datafusion-comet -f kube/Dockerfile .
```
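If the build succeeds, the image should appear in the local Docker image list; this is a quick sanity check before moving on to the benchmark image:

```shell
# Confirm the Comet image exists locally
docker images apache/datafusion-comet
```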
Build the benchmark Docker image and push it to the MicroK8s Docker registry.
```shell
docker build -t apache/datafusion-comet-tpcbench .
docker tag apache/datafusion-comet-tpcbench localhost:32000/apache/datafusion-comet-tpcbench:latest
docker push localhost:32000/apache/datafusion-comet-tpcbench:latest
```
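To confirm the push succeeded, the MicroK8s registry exposes the standard Docker Registry HTTP API; this sketch assumes the registry add-on is listening on its default port 32000:

```shell
# List repositories known to the local registry
curl http://localhost:32000/v2/_catalog

# List tags for the benchmark image
curl http://localhost:32000/v2/apache/datafusion-comet-tpcbench/tags/list
```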
Set the following environment variables, adjusting the values to match your environment:

```shell
export SPARK_MASTER=k8s://https://127.0.0.1:16443
export COMET_DOCKER_IMAGE=localhost:32000/apache/datafusion-comet-tpcbench:latest

# Location of the Comet JAR within the Docker image
export COMET_JAR=/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0-SNAPSHOT.jar
```
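The JAR file name varies with the Spark and Comet versions the image was built against, so it is worth confirming that `COMET_JAR` matches an actual path inside the image before submitting. One way to check, overriding the image entrypoint:

```shell
# List the Comet JARs baked into the image
docker run --rm --entrypoint ls $COMET_DOCKER_IMAGE /opt/spark/jars | grep comet
```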
Finally, submit the benchmark application. This example assumes TPC-DS data at scale factor 100 is available on the host at /mnt/bigdata/tpcds/sf100/, which is mounted into the driver and executor pods via hostPath volumes.

```shell
$SPARK_HOME/bin/spark-submit \
    --master $SPARK_MASTER \
    --deploy-mode cluster \
    --name comet-tpcbench \
    --driver-memory 8G \
    --conf spark.driver.memory=8G \
    --conf spark.executor.instances=1 \
    --conf spark.executor.memory=32G \
    --conf spark.executor.cores=8 \
    --conf spark.cores.max=8 \
    --conf spark.task.cpus=1 \
    --conf spark.executor.memoryOverhead=3G \
    --jars local://$COMET_JAR \
    --conf spark.executor.extraClassPath=$COMET_JAR \
    --conf spark.driver.extraClassPath=$COMET_JAR \
    --conf spark.plugins=org.apache.spark.CometPlugin \
    --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
    --conf spark.comet.enabled=true \
    --conf spark.comet.exec.enabled=true \
    --conf spark.comet.exec.all.enabled=true \
    --conf spark.comet.cast.allowIncompatible=true \
    --conf spark.comet.exec.shuffle.enabled=true \
    --conf spark.comet.exec.shuffle.mode=auto \
    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
    --conf spark.kubernetes.namespace=default \
    --conf spark.kubernetes.driver.pod.name=tpcbench \
    --conf spark.kubernetes.container.image=$COMET_DOCKER_IMAGE \
    --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \
    --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \
    --conf spark.kubernetes.executor.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \
    --conf spark.kubernetes.executor.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \
    --conf spark.kubernetes.authenticate.caCertFile=/var/snap/microk8s/current/certs/ca.crt \
    local:///opt/datafusion-benchmarks/runners/datafusion-comet/tpcbench.py \
    --benchmark tpcds \
    --data /mnt/bigdata/tpcds/sf100/ \
    --queries /opt/datafusion-benchmarks/tpcds/queries-spark \
    --iterations 1
```
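Because the job runs in cluster deploy mode, the driver executes inside the pod named by `spark.kubernetes.driver.pod.name` (here `tpcbench`), so benchmark progress and results appear in that pod's logs. A minimal sketch for following a run and cleaning up afterwards:

```shell
# Follow the benchmark driver's output
microk8s kubectl logs -f tpcbench

# Watch driver and executor pods
microk8s kubectl get pods

# Delete the completed driver pod so the name can be reused for the next run
microk8s kubectl delete pod tpcbench
```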