Redis 监控
1 Exporter 以及Service(服务)部署
1 编写 Exporter 和 Service(SVC)配置文件
# Deployment running a single redis_exporter replica that scrapes one Redis
# instance and exposes its metrics on container port 9121.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-standalone-exporter
  labels:
    app: redis-standalone-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis-standalone-exporter
  template:
    metadata:
      labels:
        app: redis-standalone-exporter
    spec:
      containers:
        - name: redis-standalone-exporter
          image: oliver006/redis_exporter:v1.50.0
          imagePullPolicy: IfNotPresent
          # Redis connection settings (address, password, ...) go here.
          # NOTE(review): the password is passed in plain text on the command
          # line; consider sourcing it from a Secret instead.
          args:
            - "-redis.addr"
            - "redis://10.111.143.64:6379"
            - "-redis.password"
            - "12345"
          # When the monitored Redis runs inside the cluster, redis.addr should
          # use the in-cluster DNS name (service.namespace), for example:
          # args: ["-redis.addr", "redis-standalone.monitorsoftware:6379", "-redis.password", "admin@123"]
          ports:
            - containerPort: 9121
# SVC
---
# ClusterIP Service in front of the exporter pods. The port name "metrics" is
# what the ServiceMonitor endpoint refers to.
apiVersion: v1
kind: Service
metadata:
  name: redis-standalone-exporter
  labels:
    app: redis-standalone-exporter
spec:
  type: ClusterIP
  # Route traffic to the pods created by the exporter Deployment.
  selector:
    app: redis-standalone-exporter
  ports:
    - name: metrics
      protocol: TCP
      port: 9121
      targetPort: 9121
2 编写 ServiceMonitor 配置文件
# SM 服务发现
---
# ServiceMonitor that tells the Prometheus Operator to scrape the exporter
# Service (selected by label) in the "default" namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: redis-standalone-exporter
  namespace: monitoring
  labels:
    app: redis-standalone-exporter
    prometheus: k8s
spec:
  endpoints:
    # Scrape interval. Must be a duration string ("15s"); a bare integer is
    # rejected by the ServiceMonitor CRD validation.
    - interval: 15s
      port: metrics
      path: /metrics
      # NOTE(review): redis_exporter serves multi-target scrapes on /scrape;
      # on /metrics the `target` parameter is most likely ignored and only
      # feeds the relabeling below - confirm this is the intent.
      params:
        target:
          # Address of the monitored Redis instance.
          - 10.111.143.64:6379
      # Report the Redis address, not the exporter pod, as `instance`.
      relabelings:
        - sourceLabels: [__param_target]
          targetLabel: instance
  # Namespace in which the exporter Service lives.
  namespaceSelector:
    matchNames:
      - default
  selector:
    matchLabels:
      app: redis-standalone-exporter
查看监控 targets
查看对应 promQL 语句
redis 告警规则
# 配置redis报警规则
[root@Prometheus-Grafana rules]# pwd
/prometheus/rules
[root@Prometheus-Grafana rules]# vim redis-exporter.rules
# Prometheus alerting rules for Redis, based on redis_exporter metrics.
# In annotations, $labels only contains the labels of the series returned by
# `expr`; labels added under a rule's `labels:` are not visible there.
groups:
  - name: redis集群预警
    rules:
      # The exporter scrape target itself is down.
      - alert: redis节点下线
        expr: 'up{instance=~".*:9121"} == 0'
        for: 5s
        labels:
          severity: critical
        annotations:
          message: '{{ $labels.instance }} redis 监控主节点下线, 请及时处理'
          summary: '{{ $labels.instance }} redis 监控主节点下线'

      # The exporter is reachable but cannot talk to Redis.
      - alert: redis服务下线
        expr: 'redis_up{instance=~".*"} == 0'
        for: 20s
        labels:
          severity: WARN
        annotations:
          message: '{{ $labels.instance }} redis 服务下线, 请及时处理'
          summary: '{{ $labels.instance }} redis 服务下线'

      # No successful RDB save within the last 24 hours.
      - alert: redis已经24小时未备份
        expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
        for: 5m
        labels:
          severity: ERROR
        annotations:
          description: 'redis 集群节点: {{$labels.instance}} redis 已经 24 小时未备份, 请立即处理'
          summary: 'Missing backup (instance {{ $labels.instance }})'

      # Fragmentation ratio below 1: Redis uses more memory than is resident.
      - alert: redis内存可用内存不足
        expr: redis_mem_fragmentation_ratio < 1
        for: 5m
        labels:
          severity: WARN
        annotations:
          description: 'Redis 当前节点 {{ $labels.instance }} redis内存可用内存不足,请减少key或增加内存'

      - alert: redis内存碎片过大
        expr: redis_mem_fragmentation_ratio > 18
        for: 5m
        labels:
          severity: ERROR
        annotations:
          description: 'Redis 当前节点 {{ $labels.instance }} 内存碎片过大, 当前: {{ $value}}'

      - alert: redis连接被拒绝
        expr: 'increase(redis_rejected_connections_total[1m]) > 0'
        for: 5m
        labels:
          alert_type: 连接被拒绝
          severity: WARN
        annotations:
          description: 'redis 服务连接 {{ $labels.instance }} 被拒绝'

      # redis_instance_info is an info metric whose value is always 1, so
      # comparing it with 0 never fires. Count the masters instead; the
      # `or vector(0)` keeps the expression defined when no series exists.
      # The aggregated result carries no instance label.
      - alert: redis主节点缺失
        expr: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
        for: 5m
        labels:
          severity: WARN
        annotations:
          summary: 'redis主节点缺失'
          description: 'redis 主节点丢失5分钟'

      # delta over a 1m window is only negative for about one minute after a
      # replica disconnects, so any `for` longer than that could never fire.
      - alert: redis副本下线
        expr: delta(redis_connected_slaves[1m]) < 0
        for: 0m
        labels:
          severity: WARN
        annotations:
          description: 'redis {{ $labels.instance}} 集群副本下线, 请立即处理'

      - alert: redis连接总数达到总量的85%
        expr: redis_connected_clients > redis_config_maxclients * 0.85
        for: 5m
        labels:
          severity: WARN
        annotations:
          description: '主机: {{ $labels.instance }} 当前连接数: {{ $value }}, 连接总数达到总量的85%'

      - alert: redis连接总数达到总量的95%
        expr: redis_connected_clients > redis_config_maxclients * 0.95
        for: 5m
        labels:
          severity: ERROR
        annotations:
          description: '主机: {{ $labels.instance }} 当前连接数: {{ $value }}, 连接总数达到总量的95%'

      - alert: redis连接数过低
        expr: redis_connected_clients == 0
        for: 5m
        labels:
          severity: WARN
          alert_type: '连接数过低'
          # Host part of the instance label (everything before the colon).
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          description: 'redis 当前: {{ $labels.instance }} 节点无连接'

      # redis_blocked_clients is a gauge, so it is compared directly rather
      # than wrapped in irate(), which is only meaningful for counters.
      - alert: redis连接故障
        expr: 'redis_blocked_clients{job="redis-sentinel"} > 3'
        for: 5m
        labels:
          severity: WARN
        annotations:
          description: '当前: {{ $labels.instance }} 5分钟内阻塞进程大于 3, 请检查连接服务是否异常'

      # Fires on a LOW hit ratio (the original comparison was inverted).
      # Tune the 0.95 threshold to the workload.
      - alert: redis低命中率效率低下
        expr: redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total) < 0.95
        for: 5m
        labels:
          severity: ERROR
        annotations:
          description: '当前: {{ $labels.instance }} 命中率低下原因: 数据到期和分配给Redis的内存不足,请及时检查内存、数据'

      # NOTE(review): redis_rdb_changes_since_last_save is a gauge and irate()
      # is meant for counters; the intended condition is unclear - confirm.
      - alert: redis异常同步
        expr: irate(redis_rdb_changes_since_last_save[60m]) == 1
        for: 60m
        labels:
          severity: ERROR
        annotations:
          description: '当前: {{ $labels.instance }} redis 某一台服务异常断开, 同步异常'

      # Replication link from a replica to its master is down.
      - alert: redis集群连接异常
        expr: 'redis_master_link_up{master_host=~".*"} == 0'
        for: 5m
        labels:
          severity: WARN
        annotations:
          description: '当前: {{ $labels.instance }} redis 复制连接当前断开'

      # Disabled: the redis_total_system_memory_bytes metric is missing.
      # - alert: "内存使用大于95%"
      #   expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 95
      #   for: 5m
      #   labels:
      #     severity: WARN
      #   annotations:
      #     description: "Redis 当前节点 {{ $labels.instance }} 内存已使用 {{ $value }}%"