opensearch-project · penghuo · Jan 31, 2023 · Jan 31, 2023
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -0,0 +1,92 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+x-spark-image: &spark-image maximus/spark:v3.2.1  # image shared by every Spark service below
+x-spark-volumes: &spark-volumes  # mounts shared by every Spark service (referenced as *spark-volumes)
+  - spark_data:/usr/share/spark  # named volume declared under the top-level `volumes` key
+  - $HOME/.aws:/root/.aws:ro  # host ~/.aws directory, mounted read-only
+  - $HOME/tmp/maximus:/root/maximus:ro  # host ~/tmp/maximus directory, mounted read-only
+
+version: "3.7"
+services:
+  opensearch:  # single-node OpenSearch; demo config install and the security plugin are disabled
+    container_name: opensearch
+    image: opensearchproject/opensearch:2.3.0
+    restart: unless-stopped
+    environment:
+      discovery.type: single-node
+      DISABLE_INSTALL_DEMO_CONFIG: "true"
+      DISABLE_SECURITY_PLUGIN: "true"
+    ports:
+      - "9200:9200"
+      - "9600:9600"
+    volumes:
+      - os_data:/usr/share/opensearch/data
+
+  spark-master:  # runs the image with SPARK_WORKLOAD=master
+    container_name: spark-master
+    image: *spark-image
+    environment:
+      SPARK_LOCAL_IP: spark-master
+      SPARK_WORKLOAD: master
+    ports:
+      - "8080:8080"
+      - "7077:7077"
+      - "4040:4040"
+    volumes: *spark-volumes
+
+  spark-worker-1:  # runs the image with SPARK_WORKLOAD=worker and registers with spark-master
+    image: *spark-image
+    container_name: spark-worker-1
+    ports:
+      - "8081:8080"  # worker web UI; the container side matches the --webui-port passed by start-spark.sh
+      - "7001:7000"  # worker RPC port: the image sets SPARK_WORKER_PORT=7000; a worker does not listen on 7077
+    depends_on:
+      - spark-master
+    volumes: *spark-volumes
+    environment:
+      - SPARK_MASTER=spark://spark-master:7077
+      - SPARK_WORKER_CORES=1
+      - SPARK_WORKER_MEMORY=2G
+      - SPARK_DRIVER_MEMORY=2G
+      - SPARK_EXECUTOR_MEMORY=2G
+      - SPARK_WORKLOAD=worker
+      - SPARK_LOCAL_IP=spark-worker-1
+
+  spark-worker-2:  # runs the image with SPARK_WORKLOAD=worker and registers with spark-master
+    image: *spark-image
+    container_name: spark-worker-2
+    ports:
+      - "8082:8080"  # worker web UI; the container side matches the --webui-port passed by start-spark.sh
+      - "7002:7000"  # worker RPC port: the image sets SPARK_WORKER_PORT=7000; a worker does not listen on 7077
+    depends_on:
+      - spark-master
+    volumes: *spark-volumes
+    environment:
+      - SPARK_MASTER=spark://spark-master:7077
+      - SPARK_WORKER_CORES=1
+      - SPARK_WORKER_MEMORY=2G
+      - SPARK_DRIVER_MEMORY=2G
+      - SPARK_EXECUTOR_MEMORY=2G
+      - SPARK_WORKLOAD=worker
+      - SPARK_LOCAL_IP=spark-worker-2
+
+volumes:  # named volumes referenced by the services above
+  os_data:  # mounted at /usr/share/opensearch/data by the opensearch service
+    external: false
+  spark_data:  # mounted at /usr/share/spark through the shared *spark-volumes list
+    external: false
diff --git a/docker/spark/Dockerfile b/docker/spark/Dockerfile
@@ -0,0 +1,48 @@
+FROM ubuntu:18.04
+
+LABEL authors="penghuo@gmail.com"
+
+# Refresh the index, install and clean up in ONE layer so a cached `update` layer can never go stale.
+RUN apt-get update \
+&& apt-get install -y --no-install-recommends ca-certificates default-jdk supervisor wget \
+&& rm -rf /var/lib/apt/lists/*
+
+ENV SPARK_VERSION=3.2.3 \
+HADOOP_VERSION=3.2
+
+# download and extract Spark (archive.apache.org retains every release; dlcdn only serves current ones)
+RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+&&  tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+&&  mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \
+&&  rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+
+# add configuration
+COPY spark-defaults.conf /opt/spark/conf/spark-defaults.conf
+COPY log4j.properties /opt/spark/conf/log4j.properties
+# trailing slash: a wildcard source may match several files, which requires a directory destination
+COPY jars/*.* /opt/spark/jars/
+COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+
+WORKDIR /opt/spark
+
+ENV SPARK_MASTER_PORT=7077 \
+SPARK_MASTER_WEBUI_PORT=8080 \
+SPARK_LOG_DIR=/opt/spark/logs \
+SPARK_MASTER_LOG=/opt/spark/logs/spark-master.out \
+SPARK_WORKER_LOG=/opt/spark/logs/spark-worker.out \
+SPARK_WORKER_WEBUI_PORT=8080 \
+SPARK_WORKER_PORT=7000 \
+SPARK_MASTER="spark://spark-master:7077" \
+SPARK_WORKLOAD="master" \
+SPARK_HOME=/opt/spark
+
+EXPOSE 4040 8080 7077 6066 10000
+
+# Point the master/worker log files at the container's stdout so `docker logs` shows them.
+RUN mkdir -p $SPARK_LOG_DIR && \
+ln -sf /dev/stdout $SPARK_MASTER_LOG && \
+ln -sf /dev/stdout $SPARK_WORKER_LOG
+
+COPY start-spark.sh /
+
+CMD ["/bin/bash", "/start-spark.sh"]
diff --git a/docker/spark/jars/opensearch-spark-30_2.12-3.0.0-SNAPSHOT.jar b/docker/spark/jars/opensearch-spark-30_2.12-3.0.0-SNAPSHOT.jar
diff --git a/docker/spark/log4j.properties b/docker/spark/log4j.properties
@@ -0,0 +1,53 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Log everything to the console (visible via `docker logs`) and to a rolling file in the container
+log4j.rootCategory=INFO,console,file
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# file appender: size-based rolling under /opt/spark/logs (SPARK_LOG_DIR in the image)
+log4j.appender.file=org.apache.log4j.RollingFileAppender
+log4j.appender.file.File=/opt/spark/logs/spark.log
+log4j.appender.file.MaxFileSize=10MB
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third-party loggers that are too verbose (Jetty, REPL internals, Parquet)
+log4j.logger.org.sparkproject.jetty=WARN
+log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+log4j.logger.org.apache.parquet=ERROR
+log4j.logger.parquet=ERROR
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+# For deploying Spark ThriftServer (this filter is attached to the `console` appender only)
+# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
+log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
+log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
+log4j.appender.console.filter.1.AcceptOnMatch=false
diff --git a/docker/spark/spark-defaults.conf b/docker/spark/spark-defaults.conf
@@ -0,0 +1,38 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
+
+# spark.sql.catalog.spark_catalog   org.apache.spark.sql.delta.catalog.DeltaCatalog
+
+# metadata: keep the SQL warehouse and the Derby system home under /usr/share/spark (the spark_data mount in docker-compose.yml)
+spark.sql.warehouse.dir                             /usr/share/spark/spark-warehouse
+spark.driver.extraJavaOptions                       -Dderby.system.home=/usr/share/spark/metastore_db
+
+# s3 configuration: use the S3A filesystem classes; credentials come from the AWS profile provider first, then the default provider chain
+spark.hadoop.fs.s3a.impl                            org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider        com.amazonaws.auth.profile.ProfileCredentialsProvider,com.amazonaws.auth.DefaultAWSCredentialsProviderChain
+spark.hadoop.fs.AbstractFileSystem.s3a.impl         org.apache.hadoop.fs.s3a.S3A
diff --git a/docker/spark/start-spark.sh b/docker/spark/start-spark.sh
@@ -0,0 +1,23 @@
+#start-spark.sh
+#!/bin/bash
+. "/opt/spark/bin/load-spark-env.sh"
+# When the spark work_load is master run class org.apache.spark.deploy.master.Master
+if [ "$SPARK_WORKLOAD" == "master" ];
+then
+
+export SPARK_MASTER_HOST=`hostname`
+
+cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG
+
+elif [ "$SPARK_WORKLOAD" == "worker" ];
+then
+# When the spark work_load is worker run class org.apache.spark.deploy.master.Worker
+cd /opt/spark/bin && ./spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG
+
+elif [ "$SPARK_WORKLOAD" == "thrift" ];
+then
+# Launch Thrift Server
+/usr/bin/supervisord
+else
+    echo "Undefined Workload Type $SPARK_WORKLOAD, must specify: master, worker, thrift"
+fi
diff --git a/docker/spark/supervisord.conf b/docker/spark/supervisord.conf
@@ -0,0 +1,12 @@
+[supervisord]
+nodaemon=true
+
+[program:spark]
+; start-thriftserver.sh normally daemonizes through spark-daemon.sh and returns at once, which
+; supervisord would see as the program exiting. SPARK_NO_DAEMONIZE keeps it in the foreground.
+environment=SPARK_NO_DAEMONIZE="true"
+command=/opt/spark/sbin/start-thriftserver.sh --master %(ENV_SPARK_MASTER)s
+; forward the server's output to the container's stdout (maxbytes=0 disables log rotation)
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+redirect_stderr=true