skywalking链路追踪增加告警配置

2,702次阅读
没有评论

共计 9825 个字符,预计需要花费 25 分钟才能阅读完成。

skywalking链路追踪增加告警配置

前面博主已将商品中心服务接入到skywalking,实现了链路追踪功能。而在运维过程中,我们还需要配置监控来触发告警,让故障信息尽快通知到相关人员进行分析,所以这里我们就给我们的服务加上监控和告警配置

告警指标

对于skywalking的告警指标,默认定义在skywalking服务的config/oal/core.oal文件内

root@8e2113b4496c:/skywalking# cat config/oal/core.oal
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".

// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();

// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();

// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99

database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
  • service_resp_time #服务的响应时间
  • service_sla #服务的http请求成功率SLA,比如99%等。
  • service_cpm #表示每分钟的吞吐量.
  • service_apdex #应用性能指数(Apdex),比如0.8、0.x等。
  • service_percentile: 指定最近多少数据范围内的响应时间百分比,即p99, p95, p90, p75, p50在内的数据统计结果
  • endpoint_relation_cpm #端点的每分钟的吞吐量
  • endpoint_relation_resp_time #端点的响应时间
  • endpoint_relation_sla #端点的http请求成功率SLA,比如99%等。
  • endpoint_relation_percentile #端点的最近多少数据范围内的响应时间百分比,即p99、 p95、 p90、 p75、p50在内的数据统计结果

如果指标不满足自己的业务的需求,可以参考上面去定制

告警规则

skywalking的告警配置文件为config/alarm-settings.yml

普通告警

  • 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾。
  • metrics-name:度量名称,也是OAL脚本中的度量名。默认配置中可以用于告警的度量范围有:服务、实例、端点、服务关系、实例关系、端点关系。它只支持long,double和int类型。
  • include-names:包含在此规则之内的实体名称列表。
  • exclude-names:排除在此规则以外的实体名称列表。
  • include-names-regex:提供一个正则表达式来包含实体名称。如果同时设置包含名称列表和包含名称的正则表达式,则两个规则都将生效。
  • exclude-names-regex:提供一个正则表达式来排除实体名称。如果同时设置排除名称列表和排除名称的正则表达式,则两个规则都将生效。
  • include-labels:包含在此规则之内的标签。
  • exclude-labels:排除在此规则以外的标签。
  • include-labels-regex:提供一个正则表达式来包含标签。如果同时设置包含标签列表和包含标签的正则表达式,则两个规则都将生效。
  • exclude-labels-regex:提供一个正则表达式来排除标签。如果同时设置排除标签列表和排除标签的正则表达式,则两个规则都将生效。
  • threshold:阈值。对于多值指标,例如percentile,阈值是一个数组,写成value1, value2, value3, value4, value5这样的形式,每个值分别对应指标中各个值的阈值。如果不想让某个或某些值触发告警,则将对应位置设置为 -。 例如在percentile中,value1是P50的阈值,value2是P75的阈值,那么-, -, value3, value4, value5的意思是:该percentile告警规则中P50和P75不设置阈值。
  • op:操作符,支持>, >=, <, <=, =
  • period:多久告警规则需要被检查一下。这是一个时间窗口,与后端部署环境时间相匹配。
  • count:在一个周期窗口中,如果按op计算超过阈值的次数达到count,则发送告警。
  • only-as-condition:true或者false,指定规则是否可以发送告警,或者仅作为复合规则的条件。
  • silence-period:在时间N中触发报警后,在N -> N + silence-period这段时间内不告警。默认情况下,它和period一样,这意味着相同的告警(同一个度量名称拥有相同的Id)在同一个周期内只会触发一次。
  • message:该规则触发时,发送的通知消息。

下面是两条告警示例:

  • 服务在2分钟内调用次数大于1,一次就触发,触发告警后静默2分钟
  • 服务响应时间在2分钟内超过100ms,一次就触发,触发告警后静默2分钟
root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
rules:
  service_cpm_rule:
    # 服务调用次数
    metrics-name: service_cpm
    op: ">"
    threshold: 1
    period: 2
    count: 1
    silence-period: 2
    message: 服务 {name} 访问次数大于1
  # Rule unique name, must be ended with `_rule`.
  service_resp_time_rule:
    metrics-name: service_resp_time
    op: ">"
    threshold: 100
    period: 2
    count: 1
    silence-period: 2
    message: Response time of service {name} is more than 100ms in last 2 minutes.
# webhook

官方的一些样例规则

rules:
  # Rule unique name, must be ended with `_rule`.
  endpoint_percent_rule:
    # Metrics value need to be long, double or int
    metrics-name: endpoint_percent
    threshold: 75
    op: <
    # The length of time to evaluate the metrics
    period: 10
    # How many times after the metrics match the condition, will trigger alarm
    count: 3
    # How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
    silence-period: 10
    # Specify if the rule can send notification or just as an condition of composite rule
    only-as-condition: false
    tags:
      level: WARNING
  service_percent_rule:
    metrics-name: service_percent
    # [Optional] Default, match all services in this metrics
    include-names:
      - service_a
      - service_b
    exclude-names:
      - service_c
    # Single value metrics threshold.
    threshold: 85
    op: <
    period: 10
    count: 4
    only-as-condition: false
  service_resp_time_percentile_rule:
    # Metrics value need to be long, double or int
    metrics-name: service_percentile
    op: ">"
    # Multiple value metrics threshold. Thresholds for P50, P75, P90, P95, P99.
    threshold: 1000,1000,1000,1000,1000
    period: 10
    count: 3
    silence-period: 5
    message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
    only-as-condition: false
  meter_service_status_code_rule:
    metrics-name: meter_status_code
    exclude-labels:
      - "200"
    op: ">"
    threshold: 10
    period: 10
    count: 3
    silence-period: 5
    message: The request number of entity {name} non-200 status is more than expected.
    only-as-condition: false

读者自行根据上面的用法编写合适业务的告警监控规则,注意: endpoint 规则,相比 service、instance 规则耗费更多内存及资源~

复合规则

就是将多个规则进行判断:

composite-rules:
  comp_rule:
    # Must satisfied percent rule and resp time rule 
    expression: service_percent_rule && service_resp_time_percentile_rule
    message: Service {name} successful rate is less than 80% and P50 of response time is over 1000ms
    tags:
      level: CRITICAL
  • 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
  • expression:指定如何组成规则,支持&&, ||, ()操作符
  • message:该规则触发时,发送的通知消息

告警通知

钉钉告警

root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
# 略

dingtalkHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - url: https://oapi.dingtalk.com/robot/send?access_token=ceb8f51dddddddddddaf640cae1db92
      secret: SEC80939ddddddddd21bcb6e2652f

配置完规则重启服务,测试下接口访问

skywalking链路追踪增加告警配置

在告警或事件中可以看到触发信息

skywalking链路追踪增加告警配置

回到钉钉群中可以看到告警信息已推送过来

skywalking链路追踪增加告警配置

微信告警

微信告警配置样例

wechatHooks:
  textTemplate: |-
    {
      "msgtype": "text",
      "text": {
        "content": "Apache SkyWalking Alarm: \n %s."
      }
    }
  webhooks:
    - https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key

飞书告警

飞书告警样例配置

feishuHooks:
  textTemplate: |-
    {
      "msg_type": "text",
      "content": {
        "text": "Apache SkyWalking Alarm: \n %s."
      },
      "ats":"feishu_user_id_1,feishu_user_id_2"
    }
  webhooks:
    - url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token
      secret: dummysecret

参考文档

正文完
 
xadocker
版权声明:本站原创文章,由 xadocker 2023-03-09发表,共计9825字。
转载说明:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)