一、为什么需要自定义告警?

  • 1、满足不同的监控需求;
  • 2、结合链路追踪及告警规则,更高效解决问题;

二、场景定义

2.1 需求

场景描述:公司主营业务为在线购物网站,那么 HTTP 服务的可用性就非常重要。如果某个服务出现故障,则可能会导致用户无法访问网站,从而影响用户体验和业务收入。

因此,需要借助 SkyWalking 自定义告警功能来监控异常 HTTP 状态码,及时发现和解决问题,提高服务质量和系统可用性。

2.2 需求分析

当接口返回的状态码为 404、500、502、503、504 其中之一时,就发送告警。

如果要添加自定义告警,首先需要在 oal 文件中添加一个指标。

Helm 包文件目录 skywalking/files/conf.d 下面有个 README.md 文件,主要介绍如何用自定义配置文件覆盖镜像内的默认配置:

[root@master01 8]# cat skywalking/files/conf.d/README.md
If you don't want to use the default configuration files packed into the Docker image,
put your own configuration files under this directory in the corresponding component subdirectory,
`oap`, `ui`, etc.

Files under `oap/*` will override the counterparts under the Docker image's `/skywalking/config/*`, with the directory structure retained, here are some examples:

| File under `files/config.d/oap` directory | Overrides the file under Docker image's `/skywalking/config/` |
| ---- | -------- |
| `files/config.d/oap/application.yml`                 | `/skywalking/config/application.yml`                  |
| `files/config.d/oap/log4j2.xml`                      | `/skywalking/config/log4j2.xml`                       |
| `files/config.d/oap/alarm-settings.yml`              | `/skywalking/config/alarm-settings.yml`               |
| `files/config.d/oap/endpoint-name-grouping.yml`      | `/skywalking/config/endpoint-name-grouping.yml`       |
| `files/config.d/oap/oal/core.oal`                    | `/skywalking/config/oal/core.oal`                     |
| `files/config.d/oap/oal/browser.oal`                 | `/skywalking/config/oal/browser.oal`                  |
| `files/config.d/oap/oc-rules/oap.yaml`               | `/skywalking/config/oc-rules/oap.yaml`                |
| `...`                                                | `...`                                                 |

Files under `satellite/*` will override the counterparts under the Docker image's `/skywalking/configs/*`, with the directory structure retained, here are some examples:

| File under `files/config.d/satellite` directory | Overrides the file under Docker image's `/skywalking/configs/` |
| ---- | -------- |
| `files/config.d/satellite/satellite_config.yaml` | `/skywalking/configs/satellite_config.yaml`  |
| `...`                                            | `...`                                        |

2.3 修改 core.oal

复制原有 core.oal 的全部内容,按照如上规则放到 files/conf.d/oap/oal/core.oal(README 中写作 files/config.d,本 Helm 包中的实际目录是 files/conf.d),然后在文件末尾追加新的指标。

Ps:如有es-init报错可先忽略;

[root@master01 ~]# cd /root/8/skywalking/files/conf.d
[root@master01 conf.d]# mkdir -p oap/oal
[root@master01 conf.d]# cd oap/oal/
[root@master01 oal]# vim core.oal
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".

// All scope metrics
all_percentile = from(All.latency).percentile(10);  // Multiple values including p50, p75, p90, p95, p99
all_heatmap = from(All.latency).histogram(100, 20);

// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();

// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_avg = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);

// Custom metric: counts Endpoint-scope records whose responseCode is one of
// 404, 500, 502, 503, 504. The alarm rule endpoint_abnormal_rule refers to it
// through `metrics-name: endpoint_abnormal`.
// NOTE(review): assumes the Endpoint scope of the deployed OAP version exposes
// a `responseCode` field — confirm against that version's scope definitions.
// zhdya 20230625
endpoint_abnormal = from(Endpoint.*).filter(responseCode in [404, 500, 502, 503, 504]).count();

更新部署

# 更新
[root@master01 ~]# cd /root/8
[root@master01 8]# helm upgrade skywalking skywalking -n devops --values ./skywalking/values.yaml

2.4 验证

# 查看运行中的pod(kgp 是 kubectl get pods 的别名)
[root@master01 8]# kgp -n devops | grep oap
skywalking-oap-5554b6699f-h2j68   1/1     Running     0              45s

# 验证查询
[root@master01 8]# kubectl exec -it skywalking-oap-5554b6699f-h2j68  -ndevops -- cat /skywalking/config/oal/core.oal |grep -C 3 zhdya
Defaulted container "oap" out of: oap, wait-for-elasticsearch (init)
// zhdya 20230625
endpoint_abnormal = from(Endpoint.*).filter(responseCode in [404, 500, 502, 503, 504]).count();

三、增加告警rules

修改helm文件中的rules配置,新增如下rules

# 添加如下内容
[root@master01 templates]# cd /root/8/skywalking/templates/
[root@master01 templates]# vim oap-configmap.yaml

     # Append this rule under `rules:` in alarm.default.alarm-settings and align
     # the rule name with the sibling rule names already present there.
     # Fires when endpoint_abnormal is >= 1 in at least 1 check of a 2-check period.
     endpoint_abnormal_rule:
        metrics-name: endpoint_abnormal
        threshold: 1
        op: ">="
        period: 2
        count: 1
        message: 接口:{name}\n 指标:接口异常\n 详情:最近2分钟内至少1次

# 完整配置文件
[root@master01 templates]# cd /root/8/skywalking/templates/
[root@master01 templates]# vim oap-configmap.yaml
{{- if .Values.oap.dynamicConfigEnabled }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: skywalking-dynamic-config
  labels:
    app: {{ template "skywalking.name" . }}
    release: {{ .Release.Name }}
    component: {{ .Values.oap.name }}
data:
  alarm.default.alarm-settings: |-
    rules:
      # Rule unique name, must be ended with `_rule`.
      # Service response-time alarm: triggers when service_resp_time is > 2000
      # at least `count` (3) times within the `period` (10) evaluation window.
      service_resp_time_rule:
        metrics-name: service_resp_time
        op: ">"
        threshold: 2000
        period: 10
        count: 3
        # Number of checks the alarm stays silent after it has triggered.
        silence-period: 5
        message: 服务:{name}\n 指标:响应时间\n 详情:至少3次超过2秒(最近10分钟内)
      # Service success-rate alarm: triggers when service_sla drops below the
      # threshold at least `count` times within the evaluation period.
      service_sla_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_sla
        op: "<"
        # The message below promises "lower than 80%". The percent metric is
        # reported on a 0-10000 scale (10000 = 100%), so 80% is 8000; this is
        # also the value used by the stock alarm-settings.yml for this rule.
        threshold: 8000
        # The length of time to evaluate the metrics
        period: 10
        # How many times after the metrics match the condition, will trigger alarm
        count: 3
        # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
        silence-period: 3
        message: 服务:{name}\n 指标:成功率\n 详情:至少3次低于80%(最近10分钟内)
      # Service percentile response-time alarm: one threshold per percentile
      # series of service_percentile (declared in core.oal as p50, p75, p90, p95, p99).
      service_resp_time_percentile_rule:
        # Metrics value need to be long, double or int
        metrics-name: service_percentile
        op: ">"
        threshold: 1000,1000,1000,1000,1000
        period: 10
        # Must agree with the "at least 3 times" wording of the message below.
        count: 3
        silence-period: 5
        message: 服务:{name}\n 指标:响应时间\n 详情:至少3次百分位超过1秒(最近10分钟内)
      # Instance response-time alarm: triggers when service_instance_resp_time
      # is > 2000 at least `count` (2) times within the `period` (10) window.
      service_instance_resp_time_rule:
        metrics-name: service_instance_resp_time
        op: ">"
        threshold: 2000
        period: 10
        count: 2
        # Number of checks the alarm stays silent after it has triggered.
        silence-period: 5
        message: 实例:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
      # Database-access response-time alarm: triggers when
      # database_access_resp_time is > 2000 at least `count` (2) times within
      # the `period` (10) window. No silence-period is set for this rule.
      database_access_resp_time_rule:
        metrics-name: database_access_resp_time
        threshold: 2000
        op: ">"
        period: 10
        count: 2
        # English equivalent of the message below: response time of database access {name} exceeded 2000ms at least 2 times in the last 10 minutes (the earlier English text kept here said 1000ms, which does not match threshold: 2000).
        message: 数据库访问:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
      # Endpoint-relation response-time alarm: triggers when
      # endpoint_relation_resp_time is > 2000 at least `count` (2) times within
      # the `period` (10) window. No silence-period is set for this rule.
      endpoint_relation_resp_time_rule:
        metrics-name: endpoint_relation_resp_time
        threshold: 2000
        op: ">"
        period: 10
        count: 2
        message: 端点关系:{name}\n 指标:响应时间\n 详情:至少2次超过2秒(最近10分钟内)
      # Old-GC alarm: triggers when instance_jvm_old_gc_count is > 1 at least
      # once within the 3-check evaluation period.
      # NOTE(review): instance_jvm_old_gc_count is not declared in the core.oal
      # shown in this document — presumably it comes from the Java agent OAL; confirm.
      instance_jvm_old_gc_count_rule:
        metrics-name: instance_jvm_old_gc_count
        threshold: 1
        op: ">"
        period: 3
        count: 1
        # The text must describe the configured window (period: 3), not one day.
        message: 实例:{name}\n 指标:OldGC次数\n 详情:最近3分钟内大于1次
      # Young-GC alarm. `count` is the number of matching checks inside the
      # evaluation period, so it can never exceed `period`; a count of 100 in a
      # period of 5 would make the rule unreachable. The "100" from the message
      # is therefore the threshold, and one matching check is enough to alarm.
      # NOTE(review): assumes instance_jvm_young_gc_count is a per-minute value
      # supplied by the Java agent OAL — confirm.
      instance_jvm_young_gc_count_rule:
        metrics-name: instance_jvm_young_gc_count
        threshold: 100
        op: ">"
        period: 5
        count: 1
        message: 实例:{name}\n 指标:YoungGC次数\n 详情:最近5分钟内至少1分钟大于100次
      # ============== Newly added alarm rule for abnormal endpoint responses ==============
      # Uses the custom endpoint_abnormal metric appended to core.oal; triggers
      # when its value is >= 1 at least once within the 2-check evaluation period.
      endpoint_abnormal_rule:
        metrics-name: endpoint_abnormal
        threshold: 1
        op: ">="
        period: 2
        count: 1
        message: 接口:{name}\n 指标:接口异常\n 详情:最近2分钟内至少1次
      # =============================================
    # WeChat Work group-bot notification settings.
    # NOTE(review): presumably the alarm message replaces %s in textTemplate — confirm.
    wechatHooks:
      textTemplate: |-
        {
          "msgtype": "text",
          "text": {
            "content": "SkyWalking 链路追踪告警: \n %s."
          }
        }
      webhooks:
        # Replace the placeholder with the key of your own group bot. A real key
        # is a credential: anyone holding it can post to the chat, so keep it out
        # of published documents and version control.
        - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<YOUR_WECHAT_BOT_KEY>
{{- end }}