1. How the script suite is put together
1.1 Variable definition layer
export start_time=$(date -d "1 day ago" "+%Y-%m-%d 00:00:00")
export end_time=$(date -d "0 day ago" "+%Y-%m-%d 00:00:00")
export etcd_host1=<etcd-host-1>
export etcd_host2=<etcd-host-2>
export etcd_host3=<etcd-host-3>
export ansible_hosts=/path/to/hosts
1.2 Command template layer
Template files such as etcd.cmd.tmpl and service.cmd.tmpl are rendered into the final command files with envsubst, so the same inspection logic can be reused across environments.
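As a minimal sketch of the render step (the exported values here are placeholders, not from a real environment):
export etcd_host1=10.0.0.1
export etcd_host2=10.0.0.2
export etcd_host3=10.0.0.3
envsubst <etcd.cmd.tmpl >etcd.cmd   # every ${etcd_hostN} in the template is replaced with the exported value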
2. Core logic of the batch execution framework
2.1 Main flow of healthcheck.sh
healthcheck.sh renders the command templates, pulls in the shared environment and helper functions, syncs the helper scripts to every host, and then runs each command file in turn:
cd $(dirname $0)
source ../env.sh
envsubst <service.cmd.tmpl >service.cmd
envsubst <etcd.cmd.tmpl >etcd.cmd
. ../run.sh
copy_shdir
run etcd.cmd $DATE
run k8s.cmd $DATE
run os.cmd $DATE
run middleware.cmd $DATE
run service.cmd $DATE
2.2 How run.sh executes command files
run.sh parses INI-style command files: a [group] header selects the ansible group, and every following line is a shell command to run on that group. For example:
[kubectl]
kubectl get node
kubectl get pod -A
[master]
systemctl status kube-apiserver | grep Active:
Each command line is then dispatched through ansible's shell module and appended to the log:
ansible -i $ansible_hosts $NODE -m shell -a "$line" | tee -a $LOG
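The group names used in the command files ([kubectl], [master], [node], [harbor]) must exist as groups in the inventory that $ansible_hosts points to. A hypothetical minimal inventory (addresses are placeholders):
[kubectl]
10.0.0.1
[master]
10.0.0.1
10.0.0.2
10.0.0.3
[node]
10.0.0.4
10.0.0.5
[harbor]
10.0.0.6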
3. Breaking the inspection content into topics
3.1 Base component inspection
etcd.cmd: etcd member status, health checks, and the member list.
k8s.cmd: status checks for the control-plane components, Nodes, DNS, and Calico.
os.cmd: CPU, memory, disk, and NTP status.
3.2 Deep inspection
middleware.cmd: status of middleware such as Redis, MinIO, ZooKeeper, Kafka, RabbitMQ, and MySQL.
service.cmd: systemd service status, plus journalctl error logs within the configured time window.
sh/process.sh: supplementary per-host process statistics.
4. Complete scripts
Below are the complete scripts for this article, ready to be copied and reused directly.
4.1 env.sh
export NTP_SERVER1=
export NTP_SERVER2=
export start_time=$(date -d "1 day ago" "+%Y-%m-%d 00:00:00")
export end_time=$(date -d "0 day ago" "+%Y-%m-%d 00:00:00")
export etcd_host1=10.232.22.71
export etcd_host2=10.232.22.72
export etcd_host3=10.232.22.73
export ansible_hosts=/home/cjs/paas_kz_k8s_v1.23.6_harbor_2.5.4_x86_64_20230412/hosts
4.2 run.sh
#!/bin/bash
export ansible_hosts=/home/cjs/paas_kz_k8s_v1.23.6_harbor_2.5.4_x86_64_20230412/hosts
copy_shdir() {
    WORK_DIR=$(dirname $0)
    ansible -i $ansible_hosts node -m copy -a "src=${WORK_DIR}/sh dest=/home/ecip/"
    #ansible -i /apps/5-deploy/deploy/k8s/paas_kz_k8s_v1.23.6_x86_64_20220914/hosts node -m copy -a "src=${WORK_DIR}/sh dest=/home/admin/"
}
# Re-attach ansible's per-host header to every output line, so each log line
# carries the host it came from
fmt() {
    awk '
    /^([0-9]{1,3}\.){3}[0-9]{1,3}[[:space:]]*\|/ { $1=sprintf("%-15s",$1); prefix=$0 }
    !/^([0-9]{1,3}\.){3}[0-9]{1,3}[[:space:]]*\|/ { print prefix"\t"$0 }
    '
}
run() {
    FILE=$1
    time=$2
    CMDS1=$(cat $FILE)
    today=$(date "+%Y%m%d")
    DATE=$(date "+%Y%m%d_%H%M%S")
    time=$DATE   # note: the timestamp passed in as $2 is overridden here
    WORK_DIR=$(dirname $0)
    mkdir -p $WORK_DIR/logs/$today
    LOG=${WORK_DIR}/logs/$today/${FILE}.${time}.log
    #LOG=${WORK_DIR}/logs/healthcheck.${time}.log
    # The command file must be read with: while read line; do ... done <<< "$CMDS1"
    # Do not use: echo "$CMDS1" | while read line; do ... done -- the pipe runs the
    # loop in a subshell, so exit only leaves the subshell and the main script keeps
    # running after an error
    while read -r line
    do
        # Skip blank lines and bare comment markers
        echo "$line" | grep -qE '^[[:space:]]*#*[[:space:]]*$' && continue
        # An INI-style [group] header names the ansible group for the lines that follow
        if echo "$line" | grep -qE '^[[:space:]]*\[[^]]*\][[:space:]]*$'; then
            echo -e "\n\n$line"
            NODE=$(echo "$line" | sed 's/\[\|\]//g')
            continue
        fi
        if [[ -z "$NODE" ]]; then
            echo "ansible group name is empty"
            exit 1
        fi
        echo -e "\n" | tee -a $LOG
        echo "$line" | tee -a $LOG
        ansible -i $ansible_hosts $NODE -m shell -a "$line" |
            fmt |
            tee -a $LOG
    done <<< "$CMDS1"
}
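To see what fmt does, feed it a fragment shaped like ansible's shell-module output (the input below is hypothetical): it remembers each `IP | CHANGED | rc=0 >>` header line and prefixes it, tab-separated, to every subsequent output line:
$ source run.sh
$ printf '10.0.0.1 | CHANGED | rc=0 >>\nReady\n' | fmt
10.0.0.1        | CHANGED | rc=0 >>    Ready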
4.3 healthcheck.sh
#!/bin/bash
# First set the start and end dates of the log window to inspect
cd $(dirname $0)
export start_time=$(date -d "1 day ago" "+%Y-%m-%d 00:00:00")
export end_time=$(date -d "0 day ago" "+%Y-%m-%d 00:00:00")
source ../env.sh
envsubst <service.cmd.tmpl >service.cmd
envsubst <etcd.cmd.tmpl >etcd.cmd
#clear log dir
#rm -f logs/*
. ../run.sh
# Sync the script directory to all hosts
copy_shdir
DATE=$(date "+%Y%m%d_%H%M%S")
# Check etcd
run etcd.cmd $DATE
# Check Kubernetes
run k8s.cmd $DATE
# Check the operating system
run os.cmd $DATE
# Check the middleware suite (deep inspection only)
run middleware.cmd $DATE
# Check system services (deep inspection only)
run service.cmd $DATE
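Since start_time/end_time always cover the previous day, the script lends itself to a daily cron job. A sketch (the install path is hypothetical):
# run the daily inspection at 01:00
0 1 * * * /opt/inspection/healthcheck/healthcheck.sh >>/var/log/k8s_healthcheck.log 2>&1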
4.4 etcd.cmd
[kubectl]
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 endpoint status
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 member list
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 endpoint health
[master]
systemctl status etcd -l
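If the installed etcdctl still defaults to the v2 API, the same commands may need the v3 switch that service.cmd below already uses, e.g.:
ETCDCTL_API=3 etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://10.232.22.71:1159,https://10.232.22.72:1159,https://10.232.22.73:1159 endpoint status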
4.5 etcd.cmd.tmpl
[kubectl]
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 endpoint status
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 member list
etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints https://${etcd_host1}:1159,https://${etcd_host2}:1159,https://${etcd_host3}:1159 endpoint health
[master]
systemctl status etcd -l
4.6 k8s.cmd
[master]
systemctl status etcd | grep Active:
systemctl status kube-scheduler | grep Active:
systemctl status kube-apiserver | grep Active:
systemctl status kube-controller-manager | grep Active:
systemctl status kubelet | grep Active:
[kubectl]
kubectl get node
kubectl get cs
kubectl get pod -A | grep calico
kubectl get po -A | grep dns
[node]
systemctl status kubelet | grep Active:
systemctl status kube-proxy | grep Active:
systemctl status docker | grep Active:
[harbor]
cd /apps/harbor_setup/harbor; docker-compose ps
docker ps -a | grep keepalive
4.7 middleware.cmd
[kubectl]
# Check Redis
kubectl get po -A | grep redis | grep -vi running
kubectl -n paas-middleware get pods | grep devops-redis-master | awk '{print $1}' | xargs -i kubectl -n paas-middleware exec {} -- redis-cli -a \!8nKVyV3tU05dtfb info Replication
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-1 | awk '{print $1}' |xargs -i kubectl -n paas-middleware exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-2 | awk '{print $1}' |xargs -i kubectl -n paas-middleware exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel
kubectl -n paas-middleware get pods |grep devops-redis-sentinel-3 | awk '{print $1}' |xargs -i kubectl -n paas-middleware exec {} -- redis-cli -p 26379 -a \!8nKVyV3tU05dtfb info sentinel
# Check MinIO
#kubectl -n paas-middleware exec minio-cluster-0 -- mc admin info mini0
kubectl get po -A | grep minio | grep -vi running
# Check ZooKeeper
kubectl get po -A | grep zk | grep -vi running
kubectl -n paas-middleware get pods | grep zk | awk '{print $1}' | xargs -i kubectl -n paas-middleware exec {} -- zkServer.sh status
# Check Kafka
#kubectl -n paas-middleware get pods | grep zk | awk 'NR==1 {print $1}' | xargs -i kubectl -n paas-middleware exec {} -- zkCli.sh ls /brokers/ids/1
kubectl get po -A | grep kafka | grep -vi running
kubectl -n paas-middleware get pods | grep zk | awk '{print $1}' | xargs -i kubectl -n paas-middleware exec {} -- zkCli.sh ls /brokers/ids/1
# Check RabbitMQ
kubectl -n paas-middleware get pods| grep "mq-" | grep -vi running
kubectl -n paas-middleware get pods | grep "mq-" | awk '{print $1}' | xargs -i kubectl -n paas-middleware exec {} -- /opt/rabbitmq/sbin/rabbitmqctl cluster_status -s
# Check MySQL
kubectl -n paas-middleware get pods | grep mysql | grep -vi running
kubectl -n paas-middleware get pods | grep mysql-master| awk '{print $1}' |xargs -i kubectl -n paas-middleware exec {} -- mysql -uadmin -p59mkkVCrMj7T0lnS -e "show slave status\G;show status like 'Threads%';"
kubectl -n paas-middleware get pods | grep mysql-master |awk '{print $1}' | xargs -i kubectl -n paas-middleware logs --tail 10 {}
kubectl -n paas-middleware get pods | grep mysql-slave | awk '{print $1}' | xargs -i kubectl -n paas-middleware logs --tail 10 {}
4.8 os.cmd
[node]
top -bn1 | grep load | awk '{printf "CPU Load --- %.2f\n", $(NF-2)}'
free -g | awk 'NR==2{printf "Memory Usage --- %4s/%4s GB (%.2f%%)\n", $3,$2,$3*100/$2 }'
df -h | egrep '[5-9][0-9]%|100%' |awk '{printf "Disk Usage --- %s \t Disk --- %s\n", $5,$6}'
timedatectl | grep 'NTP synchronized'
[mysql]
mysql -uroot -p'59mkkVCrMj7T0lnS' -e "show slave status;"
4.9 service.cmd
[master]
systemctl status etcd | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u etcd | grep Error
ETCDCTL_API=3 etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints="10.232.22.71:1159,10.232.22.72:1159,10.232.22.73:1159" endpoint health
systemctl status kube-scheduler | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u kube-scheduler | grep Error
systemctl status kube-apiserver | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u kube-apiserver | grep Error
systemctl status kube-controller-manager | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u kube-controller-manager | grep Error
[node]
systemctl status kubelet | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u kubelet | grep Error
systemctl status kube-proxy | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u kube-proxy | grep Error
systemctl status docker | grep Active:
journalctl --since="2023-07-13 00:00:00" --until="2023-07-14 00:00:00" -u docker | grep Error
sh /home/ecip/sh/process.sh
4.10 service.cmd.tmpl
[master]
systemctl status etcd | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u etcd | grep Error
ETCDCTL_API=3 etcdctl --cacert /etc/kubernetes/ssl/ca.crt --cert /etc/kubernetes/ssl/etcd_client.crt --key /etc/kubernetes/ssl/etcd_client.key --endpoints="${etcd_host1}:1159,${etcd_host2}:1159,${etcd_host3}:1159" endpoint health
systemctl status kube-scheduler | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u kube-scheduler | grep Error
systemctl status kube-apiserver | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u kube-apiserver | grep Error
systemctl status kube-controller-manager | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u kube-controller-manager | grep Error
[node]
systemctl status kubelet | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u kubelet | grep Error
systemctl status kube-proxy | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u kube-proxy | grep Error
systemctl status docker | grep Active:
journalctl --since="${start_time}" --until="${end_time}" -u docker | grep Error
sh /home/ecip/sh/process.sh
4.11 process.sh
#!/bin/bash
#Author.nhx
procs=0
running=0
sleeping=0
stopped=0
zombie=0
# Every numeric directory under /proc is the PID of a currently running process,
# and each PID directory records that process's information
for pid in /proc/[1-9]*
do
    # Each PID directory contains a stat file; its 3rd column is the process state
    if [ -f $pid/stat ]; then
        procs=$((procs+1))
        stat=$(awk '{print $3}' $pid/stat)
        case $stat in
        R)
            running=$((running+1));;
        T)
            stopped=$((stopped+1));;
        S)
            sleeping=$((sleeping+1));;
        Z)
            zombie=$((zombie+1));;
        esac
    fi
done
echo "Process statistics: total=$procs, running=$running, stopped=$stopped, sleeping=$sleeping, zombie=$zombie"
#echo "Process statistics:"
#echo "total: $procs"
#echo "running: $running"
#echo "stopped: $stopped"
#echo "sleeping: $sleeping"
#echo "zombie: $zombie"
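A sample invocation (the counts shown are illustrative, not from a real host):
$ sh process.sh
Process statistics: total=312, running=2, stopped=0, sleeping=309, zombie=1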