Building the Spark (Hadoop) worker node image
- Install the agent
Download the Linux or Windows version of the app agent provided by QingCloud, extract it, and run ./install.sh (on Windows, double-click install.bat).
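A minimal sketch of the Linux install steps, assuming the agent ships as a tarball (the download URL and directory name below are placeholders; use the actual link and package name from the QingCloud documentation):

```sh
# Placeholder URL -- substitute the agent package link provided by QingCloud.
wget -O app-agent-linux.tar.gz "<agent-download-url>"
tar -zxf app-agent-linux.tar.gz
cd app-agent-linux          # directory name depends on the actual package
./install.sh
```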
- Create the toml files
Create /etc/confd/conf.d/spark-env.sh.toml:
[template] src = "spark-env.sh.tmpl" dest = "/opt/spark/conf/spark-env.sh" keys = [ "/", ] reload_cmd = "/opt/spark/sbin/restart-slave.sh"
Create /etc/confd/conf.d/workers.toml:
[template] src = "workers.tmpl" dest = "/opt/spark/conf/slaves" keys = [ "/", ]
Create /etc/confd/conf.d/core-site.xml.toml:
[template] src = "core-site.xml.tmpl" dest = "/opt/hadoop/etc/hadoop/core-site.xml" keys = [ "/", ] reload_cmd = "/opt/hadoop/sbin/restart-slave.sh"
Create /etc/confd/conf.d/hdfs-site.xml.toml:
[template] src = "hdfs-site.xml.tmpl" dest = "/opt/hadoop/etc/hadoop/hdfs-site.xml" keys = [ "/", ] reload_cmd = "/opt/hadoop/sbin/restart-slave.sh"
Create /etc/confd/conf.d/slaves.toml:
[template] src = "slaves.tmpl" dest = "/opt/hadoop/etc/hadoop/slaves" keys = [ "/", ]
Create /etc/confd/conf.d/authorized_keys.toml:
[template] src = "authorized_keys.tmpl" dest = "/root/.ssh/authorized_keys" keys = [ "/", ]
Create /etc/confd/conf.d/hosts.toml:
[template] src = "hosts.tmpl" dest = "/etc/hosts" keys = [ "/", ]
- Create the tmpl files
Create /etc/confd/templates/spark-env.sh.tmpl:
```bash
#! /usr/bin/env bash
export SPARK_LOG_DIR=/bigdata1/spark/logs
export SPARK_WORKER_DIR=/bigdata1/spark/work
export SPARK_WORKER_OPTS="-Dspark.worker.cleanup.enabled=true -Dspark.worker.cleanup.interval=28800 -Dspark.worker.cleanup.appDataTtl=86400"
export SPARK_PID_DIR=/bigdata1/spark/pids
export SPARK_LOCAL_DIRS=/bigdata1/spark
export HADOOP_HOME=/opt/hadoop/etc/hadoop
{{range $dir := lsdir "/hosts/spark-master/"}}{{$ip := printf "/hosts/spark-master/%s/ip" $dir}}
export SPARK_MASTER_IP={{getv $ip}}{{end}}
```
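For illustration only: assuming a single Spark master is registered in the metadata service under /hosts/spark-master/ with the made-up address 192.168.100.2, confd would render /opt/spark/conf/spark-env.sh with the static exports above followed by:

```bash
export SPARK_MASTER_IP=192.168.100.2
```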
Create /etc/confd/templates/workers.tmpl:
{{range $dir := lsdir "/hosts/worker/"}}{{$ip := printf "/hosts/worker/%s/ip" $dir}} {{getv $ip}}{{end}}
Create /etc/confd/templates/core-site.xml.tmpl:
<?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <name>fs.defaultFS</name> {{range $dir := lsdir "/hosts/hadoop-master/"}}{{$ip := printf "/hosts/hadoop-master/%s/ip" $dir}} <value>hdfs://{{getv $ip}}:9000</value>{{end}} </property> <property> <name>hadoop.tmp.dir</name> <value>/bigdata1/hadoop/tmp</value> </property> <property> <name>dfs.hosts.exclude</name> <value>/opt/hadoop/etc/hadoop/exclude</value> </property> <property> <name>io.file.buffer.size</name> <value>131072</value> </property> <property> <name>fs.trash.interval</name> <value>{{getv "/env/fs.trash.interval"}}</value> </property> </configuration>
Create /etc/confd/templates/hdfs-site.xml.tmpl:
<?xml version="1.0" encoding="UTF-8"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <name>dfs.replication</name> <value>{{getv "/env/dfs.replication"}}</value> </property> <property> <name>dfs.replication.max</name> <value>10</value> </property> <property> <name>dfs.replication.min</name> <value>1</value> </property> <property> <name>dfs.datanode.max.transfer.threads</name> <value>40960</value> </property> <property> <name>dfs.namenode.name.dir</name> <value>file:///bigdata1/hadoop/tmp/dfs/name</value> </property> <property> <name>dfs.datanode.data.dir</name> <value>file:///bigdata1/hadoop/tmp/dfs/data,/bigdata2/hadoop/tmp/dfs/data,/bigdata3/hadoop/tmp/dfs/data</value> </property> <property> <name>dfs.webhdfs.enabled</name> <value>true</value> </property> <property> <name>dfs.namenode.handler.count</name> <value>10</value> </property> <property> <name>dfs.namenode.handler.count</name> <value>{{getv "/env/dfs.namenode.handler.count"}}</value> </property> <property> <name>dfs.datanode.handler.count</name> <value>{{getv "/env/dfs.datanode.handler.count"}}</value> </property> <property> <name>dfs.datanode.handler.count</name> <value>10</value> </property> </configuration>
Create /etc/confd/templates/slaves.tmpl:
{{range $dir := lsdir "/hosts/worker/"}}{{$ip := printf "/hosts/worker/%s/ip" $dir}} {{getv $ip}}{{end}}
Create /etc/confd/templates/authorized_keys.tmpl:
{{range $dir := lsdir "/hosts/spark-master/"}}{{$pub_key := printf "/hosts/spark-master/%s/pub_key" $dir}} {{getv $pub_key}}{{end}} {{range $dir := lsdir "/hosts/hadoop-master/"}}{{$pub_key := printf "/hosts/hadoop-master/%s/pub_key" $dir}} {{getv $pub_key}}{{end}}
Create /etc/confd/templates/hosts.tmpl:
{{range $dir := lsdir "/hosts/spark-master/"}}{{$ip := printf "/hosts/spark-master/%s/ip" $dir}} {{getv $ip}} {{$dir}}{{end}} {{range $dir := lsdir "/hosts/hadoop-master/"}}{{$ip := printf "/hosts/hadoop-master/%s/ip" $dir}} {{getv $ip}} {{$dir}}{{end}} {{range $dir := lsdir "/hosts/worker/"}}{{$ip := printf "/hosts/worker/%s/ip" $dir}} {{getv $ip}} {{$dir}}{{end}}
- Supplementary scripts
Create /opt/spark/sbin/restart-slave.sh:
```sh
#! /bin/sh
# Wait (up to 60 x 3s) for confd to write SPARK_MASTER_IP into spark-env.sh.
loop=60
find=0
master_ip=""
while [ "$loop" -gt 0 ]
do
    master_ip=`grep SPARK_MASTER_IP /opt/spark/conf/spark-env.sh | cut -d = -f 2`
    if [ "x$master_ip" = "x" ]
    then
        sleep 3s
        loop=`expr $loop - 1`
    else
        find=1
        break
    fi
done
if [ "$find" -eq 0 ]
then
    echo "Failed to find spark master IP" 1>&2
    exit 1
fi

# If the worker is not running yet, just start it; otherwise stop it first.
pid=`ps ax | grep java | grep org.apache.spark.deploy.worker.Worker | grep -v grep | awk '{print $1}'`
if [ "x$pid" = "x" ]
then
    /opt/spark/sbin/start-slave.sh spark://$master_ip:7077
    exit 0
else
    /opt/spark/sbin/stop-slave.sh
fi

# Wait (up to 60 x 3s) for the worker to exit, force-kill it if it does not, then start it again.
loop=60
force=1
while [ "$loop" -gt 0 ]
do
    pid=`ps ax | grep java | grep org.apache.spark.deploy.worker.Worker | grep -v grep | awk '{print $1}'`
    if [ "x$pid" = "x" ]
    then
        force=0
        break
    else
        sleep 3s
        loop=`expr $loop - 1`
    fi
done
if [ "$force" -eq 1 ]
then
    kill -9 $pid
fi
/opt/spark/sbin/start-slave.sh spark://$master_ip:7077
```
Create /opt/hadoop/sbin/restart-slave.sh:
```sh
#! /bin/sh
# If the DataNode is not running yet, just start it; otherwise stop it first.
pid=`ps ax | grep java | grep datanode | grep -v grep | awk '{print $1}'`
if [ "x$pid" = "x" ]
then
    USER=root /opt/hadoop/sbin/hadoop-daemon.sh start datanode
    exit 0
else
    USER=root /opt/hadoop/sbin/hadoop-daemon.sh stop datanode
fi

# Wait (up to 60 x 3s) for the DataNode to exit, force-kill it if it does not, then start it again.
loop=60
force=1
while [ "$loop" -gt 0 ]
do
    pid=`ps ax | grep java | grep datanode | grep -v grep | awk '{print $1}'`
    if [ "x$pid" = "x" ]
    then
        force=0
        break
    else
        sleep 3s
        loop=`expr $loop - 1`
    fi
done
if [ "$force" -eq 1 ]
then
    kill -9 $pid
fi
USER=root /opt/hadoop/sbin/hadoop-daemon.sh start datanode
```
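Both scripts are invoked by confd through reload_cmd, so they need to be executable in the image; a reasonable extra step (not listed above) is:

```sh
chmod +x /opt/spark/sbin/restart-slave.sh /opt/hadoop/sbin/restart-slave.sh
```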
- On the worker image, create the empty exclude file referenced by dfs.hosts.exclude:
```sh
touch /opt/hadoop/etc/hadoop/exclude
```