一、基于钉钉的报警媒介

自定义机器人安全设置 - 钉钉开放平台 (dingtalk.com)

Day07-容器云平台监控一体化-图12

创建自定义机器人 - 钉钉开放平台

image-20250412171946769

钉钉-机器人管理(复制生成的webhook)

查看生成的webhook:https://oapi.dingtalk.com/robot/send?access_token=5fddceb7c1a3169016bfcad7ae5e3412fd32a90e0ff919a8b480432c810fe4d3

1.1 dingtalk部署配置

模板文件

cat << EOF > dingtalk-webhook.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    run: dingtalk
  name: webhook-dingtalk
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      run: dingtalk
  template:
    metadata:
      labels:
        run: dingtalk
    spec:
      containers:
      - name: dingtalk
        image: registry.cn-hangzhou.aliyuncs.com/abroad_images/prometheus-webhook-dingtalk:v1.4.0
        imagePullPolicy: IfNotPresent
        args:
          - --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=<替换成你的token>
        ports:
        - containerPort: 8060
          protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  labels:
    run: dingtalk
  name: webhook-dingtalk
  namespace: monitor
spec:
  ports:
  - port: 8060
    protocol: TCP
    targetPort: 8060
  selector:
    run: dingtalk
  sessionAffinity: None
EOF

完整文件

cat << EOF > dingtalk-webhook.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    run: dingtalk
  name: webhook-dingtalk
  namespace: monitor
spec:
  replicas: 1
  selector:
    matchLabels:
      run: dingtalk
  template:
    metadata:
      labels:
        run: dingtalk
    spec:
      containers:
      - name: dingtalk
        image: registry.cn-hangzhou.aliyuncs.com/abroad_images/prometheus-webhook-dingtalk:v1.4.0
        imagePullPolicy: IfNotPresent
        args:
          - --ding.profile=webhook1=https://oapi.dingtalk.com/robot/send?access_token=5fddceb7c1a3169016bfcad7ae5e3412fd32a90e0ff919a8b480432c810fe4d3
        ports:
        - containerPort: 8060
          protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  labels:
    run: dingtalk
  name: webhook-dingtalk
  namespace: monitor
spec:
  ports:
  - port: 8060
    protocol: TCP
    targetPort: 8060
  selector:
    run: dingtalk
  sessionAffinity: None
EOF

# 应用
[root@master01 7]# kaf dingtalk-webhook.yaml

1.2 配置alertmanager配置文件configmap

# 修改如下配置文件
    route:
      group_by: ['env','instance','type','group','job','alertname','cluster']
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 10m
      receiver: 'email'
      routes:
      - receiver: 'wechat'
        match:
          severity: critical

      ##新增告警receiver通道
      - receiver: 'webhook'
        match:
          hostname: zhdya

    receivers:
    - name: 'email'
      email_configs:
      - to: 'zhxxx@163.com'
        send_resolved: true
        html: '{{ template "email.to.html" . }}'

    - name: 'wechat'
      wechat_configs:
      - corp_id: 'ww187a2xxxaececc4'
        to_party: '413'
        to_user: '@all'
        agent_id: 1000035
        api_secret: 'IVRfzG15S6lb5WRCq2-xxxoqFXSnBdY3fyocuDP-tc'
        send_resolved: true

    - name: 'webhook'           ## 配置接收告警的媒介
      webhook_configs:
      - url: 'http://webhook-dingtalk.monitor.svc.cluster.local:8060/dingtalk/webhook1/send'
        send_resolved: true

# 完整配置文件
[root@master01 7]# vim alertmanager-configmap-dingtalk.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitor
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 1m
      #邮箱服务器的SMTP主机配置,个人邮箱使用smtp.qq.com:465,企业邮箱使用smtp.exmail.qq.com:465
      smtp_smarthost: 'smtp.qq.com:465'
      #发件人地址(邮件的 From 字段)
      smtp_from: 'wechat@wechat.cn'
      #登录用户名
      smtp_auth_username: '1904763431@qq.com'
      #此处的auth password是邮箱的第三方登录授权密码,而非用户密码
      smtp_auth_password: 'xdjdwczivdfpcbhj'
      #有些邮箱需要开启此配置,这里使用的是企微邮箱,仅做测试,不需要开启此功能。
      smtp_require_tls: false

    templates:
      - '/etc/alertmanager/*.tmpl'
    route:
      group_by: ['env','instance','type','group','job','alertname','cluster']
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 10m
      receiver: 'email'
      routes:
      - receiver: 'wechat'
        match:
          severity: critical

      - receiver: 'webhook'
        match:
          hostname: zhdya

    receivers:
    - name: 'email'
      email_configs:
      - to: 'xiaozhang_vip123@163.com'
        send_resolved: true
        html: '{{ template "email.to.html" . }}'

    - name: 'wechat'
      wechat_configs:
      - corp_id: 'wwe897498df31cc026'
        to_party: '4'
        to_user: '@all'
        agent_id: 1000005
        api_secret: 'kqi1OyauWVz5HJvNSV268ORZQ8bANHlAV4l5j9BqWqc'
        send_resolved: true

    #配置接收告警的媒介
    - name: 'webhook'
      webhook_configs:
      - url: 'http://webhook-dingtalk.monitor.svc.cluster.local:8060/dingtalk/webhook1/send'
        send_resolved: true

    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']

  wechat.tmpl: |-
    {{ define "wechat.default.message" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 监控报警 =========
    告警状态:{{   .Status }}
    告警级别:{{ .Labels.severity }}
    告警类型:{{ $alert.Labels.alertname }}
    故障主机: {{ $alert.Labels.instance }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    触发阀值:{{ .Annotations.value }}
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 告警恢复 =========
    告警类型:{{ .Labels.alertname }}
    告警状态:{{   .Status }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    {{- if gt (len $alert.Labels.instance) 0 }}
    实例信息: {{ $alert.Labels.instance }}
    {{- end }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- end }}

  email.tmpl: |-
    {{ define "email.from" }}xxx.com{{ end }}
    {{ define "email.to" }}xxx.com{{ end }}
    {{ define "email.to.html" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{ range .Alerts }}
    ========= 监控报警 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }}  <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}

    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{ range .Alerts }}
    ========= 告警恢复 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}

    {{- end }}

#重新应用
[root@master01 7]# kaf alertmanager-configmap-dingtalk.yaml

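#热加载(注意:本次修改的是 Alertmanager 的配置,应对 Alertmanager 执行 reload,例如 http://alertmanager.zhang-qing.com/-/reload;请确认下面的地址指向的是 Alertmanager)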
[root@master01 7]# curl -XPOST http://prometheus.zhang-qing.com/-/reload

测试验证:

#匹配如上webhook标签:hostname:zhdya
[root@master01 7]#
curl -XPOST -H 'Content-Type: application/json' \
  http://alertmanager.zhang-qing.com/api/v1/alerts \
  -d '[
    {
      "labels": {
        "hostname": "zhdya"
      },
      "annotations": {
        "summary": "[监控告警] 主机宕机告警测试",
        "description": "[监控告警] 主机 zhdya (other-ECS) 已宕机超过1分钟,请立即处理!"
      }
    }
  ]'

#回显内容
{"status":"success"}

image-20250412175842753

重新修改配置文件

# 修改配置文件,调换hostname: zhdya 和severity: critical位置
      routes:
      - receiver: 'wechat'
        match:
          hostname: zhdya

      - receiver: 'webhook'
        match:
          severity: critical

# 完整配置文件
[root@master01 7]# vim alertmanager-configmap-dingtalk.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitor
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 1m
      #邮箱服务器的SMTP主机配置,个人邮箱使用smtp.qq.com:465,企业邮箱使用smtp.exmail.qq.com:465
      smtp_smarthost: 'smtp.qq.com:465'
      #发件人地址(邮件的 From 字段)
      smtp_from: 'wechat@wechat.cn'
      #登录用户名
      smtp_auth_username: '1904763431@qq.com'
      #此处的auth password是邮箱的第三方登录授权密码,而非用户密码
      smtp_auth_password: 'xdjdwczivdfpcbhj'
      #有些邮箱需要开启此配置,这里使用的是企微邮箱,仅做测试,不需要开启此功能。
      smtp_require_tls: false

    templates:
      - '/etc/alertmanager/*.tmpl'
    route:
      group_by: ['env','instance','type','group','job','alertname','cluster']
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 10m
      receiver: 'email'
      routes:
      - receiver: 'wechat'
        match:
          hostname: zhdya

      - receiver: 'webhook'
        match:
          severity: critical

    receivers:
    - name: 'email'
      email_configs:
      - to: 'xiaozhang_vip123@163.com'
        send_resolved: true
        html: '{{ template "email.to.html" . }}'

    - name: 'wechat'
      wechat_configs:
      - corp_id: 'wwe897498df31cc026'
        to_party: '4'
        to_user: '@all'
        agent_id: 1000005
        api_secret: 'kqi1OyauWVz5HJvNSV268ORZQ8bANHlAV4l5j9BqWqc'
        send_resolved: true

    #配置接收告警的媒介
    - name: 'webhook'
      webhook_configs:
      - url: 'http://webhook-dingtalk.monitor.svc.cluster.local:8060/dingtalk/webhook1/send'
        send_resolved: true

    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']

  wechat.tmpl: |-
    {{ define "wechat.default.message" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 监控报警 =========
    告警状态:{{   .Status }}
    告警级别:{{ .Labels.severity }}
    告警类型:{{ $alert.Labels.alertname }}
    故障主机: {{ $alert.Labels.instance }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    触发阀值:{{ .Annotations.value }}
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    {{- if eq $index 0 }}
    ========= 告警恢复 =========
    告警类型:{{ .Labels.alertname }}
    告警状态:{{   .Status }}
    告警主题: {{ $alert.Annotations.summary }}
    告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
    故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    {{- if gt (len $alert.Labels.instance) 0 }}
    实例信息: {{ $alert.Labels.instance }}
    {{- end }}
    ========= = end =  =========
    {{- end }}
    {{- end }}
    {{- end }}
    {{- end }}

  email.tmpl: |-
    {{ define "email.from" }}xxx.com{{ end }}
    {{ define "email.to" }}xxx.com{{ end }}
    {{ define "email.to.html" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{ range .Alerts }}
    ========= 监控报警 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }}  <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}

    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{ range .Alerts }}
    ========= 告警恢复 =========<br>
    告警程序: prometheus_alert <br>
    告警级别: {{ .Labels.severity }} <br>
    告警类型: {{ .Labels.alertname }} <br>
    告警主机: {{ .Labels.instance }} <br>
    告警主题: {{ .Annotations.summary }} <br>
    告警详情: {{ .Annotations.description }} <br>
    触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
    恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
    ========= = end =  =========<br>
    {{ end }}{{ end -}}

    {{- end }}

#重新应用
[root@master01 7]# kaf alertmanager-configmap-dingtalk.yaml

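#热加载(注意:本次修改的是 Alertmanager 的配置,应对 Alertmanager 执行 reload,例如 http://alertmanager.zhang-qing.com/-/reload;请确认下面的地址指向的是 Alertmanager)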
[root@master01 7]# curl -XPOST http://prometheus.zhang-qing.com/-/reload

#切记将钉钉机器人的自定义关键词设置为critical(告警消息中必须包含该关键词),否则钉钉不会发送告警

#关掉node-exporter的容器
[root@iZ2zei8khbswwoi09foupeZ ~]# docker ps | grep node
726b455c0f35   registry.cn-hangzhou.aliyuncs.com/abroad_images/node-exporter:latest      "/bin/node_exporter …"   19 hours ago   Up 19 hours   0.0.0.0:9100->9100/tcp, [::]:9100->9100/tcp   flamboyant_mccarthy
[root@iZ2zei8khbswwoi09foupeZ ~]# docker stop flamboyant_mccarthy

image-20250412192717482

二、告警静默

2.1 什么是告警静默

静默(Silences):指通过设置,让告警在指定时间段内暂时不发送通知的一种方式。

  • 在处理严重的生产故障时,排查往往耗时较长,可通过设置静默避免持续收到大量无用的通知;
  • 在已知的例行维护期间,通过静默防止针对被维护的机器发送不必要的告警。

2.2 如何设置临时静默

Day07-容器云平台监控一体化-图13

三、总结

  • 实时告警通知:企业微信/钉钉等即时通信工具能够实现实时的告警通知,使得团队成员能够及时响应和解决问题。
  • 告警静默:在处理已知问题或排查故障期间,对相关告警设置静默,避免收到不必要的通知。