skywalking链路追踪增加告警配置 - SRE回忆录

共计 9825 个字符，预计需要花费 25 分钟才能阅读完成。

前面博主已将商品中心服务接入到skywalking，实现了链路追踪功能。而在运维过程中，我们还需要配置监控来触发告警，让故障信息尽快通知到相关人员进行分析，所以这里就给我们的服务加上监控和告警配置。

告警指标

对于skywalking的告警指标，默认定义在skywalking服务的config/oal/core.oal文件内

root@8e2113b4496c:/skywalking# cat config/oal/core.oal
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".

// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();

// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);

service_resp_time #服务的响应时间
service_sla #服务的http请求成功率SLA,比如99%等。
service_cpm #表示每分钟的吞吐量.
service_apdex #应用性能指数（Apdex），分值形如0.8这样的0.x
service_percentile #指定最近多少数据范围内的响应时间百分比,即p99, p95, p90, p75, p50在内的数据统计结果
endpoint_relation_cpm #端点关系（端点间调用）的每分钟吞吐量
endpoint_relation_resp_time #端点关系（端点间调用）的响应时间
endpoint_relation_sla #端点关系（端点间调用）的请求成功率SLA,比如99%等。
endpoint_relation_percentile #端点关系（端点间调用）最近多少数据范围内的响应时间百分比,即p99、 p95、 p90、 p75、p50在内的数据统计结果

如果指标不满足自己的业务的需求，可以参考上面去定制

告警规则

skywalking的告警配置文件为config/alarm-settings.yml

普通告警

规则名称：在告警信息中显示的唯一名称，必须以_rule结尾。
metrics-name：度量名称，也是OAL脚本中的度量名。默认配置中可以用于告警的度量有：服务，实例，端点，服务关系，实例关系，端点关系。它只支持long,double和int类型。
include-names：包含在此规则之内的实体名称列表。
exclude-names：排除在此规则以外的实体名称列表。
include-names-regex：提供一个正则表达式来包含实体名称。如果同时设置包含名称列表和包含名称的正则表达式，则两个规则都将生效。
exclude-names-regex：提供一个正则表达式来排除实体名称。如果同时设置排除名称列表和排除名称的正则表达式，则两个规则都将生效。
include-labels：包含在此规则之内的标签。
exclude-labels：排除在此规则以外的标签。
include-labels-regex：提供一个正则表达式来包含标签。如果同时设置包含标签列表和包含标签的正则表达式，则两个规则都将生效。
exclude-labels-regex：提供一个正则表达式来排除标签。如果同时设置排除标签列表和排除标签的正则表达式，则两个规则都将生效。

threshold：阈值。对于多值指标，例如percentile，阈值是一个数组，按 value1, value2, value3, value4, value5 这样的形式描述，每个值分别作为该指标中对应值的阈值。如果不想让某个或某些值触发告警，则将对应位置设置为 -。例如在percentile中，value1是P50的阈值，value2是P75的阈值，那么 -, -, value3, value4, value5 表示该percentile告警规则不对P50和P75设置阈值。

op：操作符，支持>, >=, <, <=, =。
period：多久告警规则需要被检查一下。这是一个时间窗口，与后端部署环境时间相匹配。
count：在一个周期窗口中，如果按op计算超过阈值的次数达到count，则发送告警。
only-as-condition：true或者false，指定规则是否可以发送告警，或者仅作为复合规则的条件。
silence-period：在时间N中触发报警后，在N -> N + silence-period这段时间内不告警。默认情况下，它和period一样，这意味着相同的告警（同一个度量名称拥有相同的Id）在同一个周期内只会触发一次。
message：该规则触发时，发送的通知消息。

下面是两条告警示例：

服务在2分钟内调用次数大于1，一次就触发，触发告警后静默2分钟
服务响应时间在2分钟内超过100ms，一次就触发，触发告警后静默2分钟

root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
rules:
  service_cpm_rule:
    # 服务调用次数
    metrics-name: service_cpm
    op: ">"
    threshold: 1
    period: 2
    count: 1
    silence-period: 2
    message: 服务 {name} 访问次数大于1
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 100
    period: 2
    count: 1
    silence-period: 2
    message: Response time of service {name} is more than 100ms in last 2 minutes.
# webhook

官方的一些样例规则

rules:
  # Rule unique name, must be ended with `_rule`.
  endpoint_percent_rule:
    # Metrics value need to be long, double or int
    metrics-name: endpoint_percent
    threshold: 75
    op: <
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 3
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 10
    # Specify if the rule can send notification or just as an condition of composite rule
    only-as-condition: false
    tags:
      level: WARNING
  service_percent_rule:
    metrics-name: service_percent
    # [Optional] Default, match all services in this metrics
    include-names:
      - service_a
      - service_b
    exclude-names:
      - service_c
    # Single value metrics threshold.
    threshold: 85
    op: <
    period: 10
    count: 4
    only-as-condition: false
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    # Multiple value metrics threshold. Thresholds for P50, P75, P90, P95, P99.
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
    only-as-condition: false
  meter_service_status_code_rule:
    metrics-name: meter_status_code
    exclude-labels:
      - "200"
    op: ">"
    threshold: 10
    period: 10
    count: 3
    silence-period: 5
    message: The request number of entity {name} non-200 status is more than expected.
    only-as-condition: false

读者可自行根据上面的用法编写适合业务的告警监控规则。注意：endpoint 规则相比 service、instance 规则会耗费更多内存及资源~

复合规则

就是将多个规则进行判断：

composite-rules:
  comp_rule:
    # Must satisfied percent rule and resp time rule 
    expression: service_percent_rule && service_resp_time_percentile_rule
    message: Service {name} successful rate is less than 85% and P50 of response time is over 1000ms
    tags:
      level: CRITICAL

规则名称：在告警信息中显示的唯一名称，必须以_rule结尾
expression：指定如何组成规则，支持&&, ||, ()操作符
message：该规则触发时，发送的通知消息

告警通知

钉钉告警

root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
# 略

dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=ceb8f51dddddddddddaf640cae1db92
      secret: SEC80939ddddddddd21bcb6e2652f

配置完规则重启服务，测试下接口访问

在告警或事件中可以看到触发信息

回到钉钉群中可以看到告警信息已推送过来

微信告警

微信告警配置样例

wechatHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key

飞书告警

飞书告警样例配置

feishuHooks:
  textTemplate: |-
    {
      "msg_type": "text",
      "content": {
        "text": "Apache SkyWalking Alarm: \n %s."
      },
      "ats":"feishu_user_id_1,feishu_user_id_2"
    }
  webhooks:
    - url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token
      secret: dummysecret

参考文档

https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md