nezha/service/singleton/alertsentinel.go

161 lines
4.6 KiB
Go
Raw Normal View History

2022-01-08 22:54:14 -05:00
package singleton
2020-12-19 23:18:27 -05:00
import (
"fmt"
2021-01-05 20:35:04 -05:00
"log"
2020-12-19 23:18:27 -05:00
"sync"
"time"
"github.com/naiba/nezha/model"
)
// Result states for an alert rule evaluated against one server. checkStatus
// records _RuleCheckFail or _RuleCheckPass in alertsPrevState.
const (
// _RuleCheckNoData is 0 (iota); it is also the value a lookup of a missing
// alertsPrevState entry yields, i.e. "no previous result".
_RuleCheckNoData = iota
// _RuleCheckFail (1) is recorded when the rule's check did not pass.
_RuleCheckFail
// _RuleCheckPass (2) is recorded when the rule's check passed.
_RuleCheckPass
)
2021-11-06 04:00:08 -04:00
// NotificationHistory pairs a duration with a point in time.
// NOTE(review): nothing in this part of the file uses this type; the field
// meanings below are inferred from their names only — confirm against the
// notification code before relying on them.
type NotificationHistory struct {
// Duration is presumably the current mute/back-off interval — TODO confirm.
Duration time.Duration
// Until is presumably the instant up to which that interval applies — TODO confirm.
Until time.Time
}
2020-12-19 23:18:27 -05:00
// 报警规则
2021-11-06 04:00:08 -04:00
var AlertsLock sync.RWMutex
var Alerts []*model.AlertRule
2022-04-11 10:51:02 -04:00
var alertsStore map[uint64]map[uint64][][]interface{} // [alert_id][server_id] -> 对应报警规则的检查结果
var alertsPrevState map[uint64]map[uint64]uint // [alert_id][server_id] -> 对应报警规则的上一次报警状态
var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats // [alert_id] -> 对应报警规则的周期流量统计
2020-12-19 23:18:27 -05:00
2022-04-11 10:51:02 -04:00
// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量报警统计信息
2021-11-06 04:00:08 -04:00
func addCycleTransferStatsInfo(alert *model.AlertRule) {
if !alert.Enabled() {
return
}
for j := 0; j < len(alert.Rules); j++ {
if !alert.Rules[j].IsTransferDurationRule() {
continue
}
if AlertsCycleTransferStatsStore[alert.ID] == nil {
from := alert.Rules[j].GetTransferDurationStart()
to := alert.Rules[j].GetTransferDurationEnd()
2021-11-06 04:00:08 -04:00
AlertsCycleTransferStatsStore[alert.ID] = &model.CycleTransferStats{
Name: alert.Name,
From: from,
To: to,
2021-11-10 20:40:10 -05:00
Max: uint64(alert.Rules[j].Max),
Min: uint64(alert.Rules[j].Min),
2021-11-06 04:00:08 -04:00
ServerName: make(map[uint64]string),
Transfer: make(map[uint64]uint64),
NextUpdate: make(map[uint64]time.Time),
}
}
}
2020-12-19 23:18:27 -05:00
}
2022-04-11 10:51:02 -04:00
// AlertSentinelStart 报警器启动
func AlertSentinelStart() {
2020-12-19 23:18:27 -05:00
alertsStore = make(map[uint64]map[uint64][][]interface{})
alertsPrevState = make(map[uint64]map[uint64]uint)
2021-11-06 04:00:08 -04:00
AlertsCycleTransferStatsStore = make(map[uint64]*model.CycleTransferStats)
AlertsLock.Lock()
if err := DB.Find(&Alerts).Error; err != nil {
2020-12-19 23:18:27 -05:00
panic(err)
}
2021-11-06 04:00:08 -04:00
for i := 0; i < len(Alerts); i++ {
alertsStore[Alerts[i].ID] = make(map[uint64][][]interface{})
alertsPrevState[Alerts[i].ID] = make(map[uint64]uint)
addCycleTransferStatsInfo(Alerts[i])
2020-12-19 23:18:27 -05:00
}
2021-11-06 04:00:08 -04:00
AlertsLock.Unlock()
2020-12-19 23:18:27 -05:00
time.Sleep(time.Second * 10)
2021-01-05 20:35:04 -05:00
var lastPrint time.Time
var checkCount uint64
for {
startedAt := time.Now()
checkStatus()
checkCount++
if lastPrint.Before(startedAt.Add(-1 * time.Hour)) {
if Conf.Debug {
2021-09-27 09:18:09 -04:00
log.Println("NEZHA>> 报警规则检测每小时", checkCount, "次", startedAt, time.Now())
}
2021-01-05 20:35:04 -05:00
checkCount = 0
lastPrint = startedAt
}
time.Sleep(time.Until(startedAt.Add(time.Second * 3))) // 3秒钟检查一次
2021-01-05 20:35:04 -05:00
}
2020-12-19 23:18:27 -05:00
}
func OnRefreshOrAddAlert(alert model.AlertRule) {
2021-11-06 04:00:08 -04:00
AlertsLock.Lock()
defer AlertsLock.Unlock()
2020-12-19 23:18:27 -05:00
delete(alertsStore, alert.ID)
delete(alertsPrevState, alert.ID)
2020-12-21 03:34:21 -05:00
var isEdit bool
2021-11-06 04:00:08 -04:00
for i := 0; i < len(Alerts); i++ {
if Alerts[i].ID == alert.ID {
Alerts[i] = &alert
2020-12-21 03:34:21 -05:00
isEdit = true
2020-12-19 23:18:27 -05:00
}
}
2020-12-21 03:34:21 -05:00
if !isEdit {
2021-11-06 04:00:08 -04:00
Alerts = append(Alerts, &alert)
2020-12-21 03:34:21 -05:00
}
2020-12-19 23:18:27 -05:00
alertsStore[alert.ID] = make(map[uint64][][]interface{})
alertsPrevState[alert.ID] = make(map[uint64]uint)
2021-11-06 04:00:08 -04:00
delete(AlertsCycleTransferStatsStore, alert.ID)
addCycleTransferStatsInfo(&alert)
2020-12-19 23:18:27 -05:00
}
func OnDeleteAlert(id uint64) {
2021-11-06 04:00:08 -04:00
AlertsLock.Lock()
defer AlertsLock.Unlock()
2020-12-19 23:18:27 -05:00
delete(alertsStore, id)
delete(alertsPrevState, id)
2021-11-06 04:00:08 -04:00
for i := 0; i < len(Alerts); i++ {
if Alerts[i].ID == id {
Alerts = append(Alerts[:i], Alerts[i+1:]...)
i--
2020-12-19 23:18:27 -05:00
}
}
2021-11-06 04:00:08 -04:00
delete(AlertsCycleTransferStatsStore, id)
2020-12-19 23:18:27 -05:00
}
2022-04-11 10:51:02 -04:00
// checkStatus 检查报警规则并发送报警
2020-12-19 23:18:27 -05:00
func checkStatus() {
2021-11-06 04:00:08 -04:00
AlertsLock.RLock()
defer AlertsLock.RUnlock()
ServerLock.RLock()
defer ServerLock.RUnlock()
2020-12-19 23:18:27 -05:00
2021-11-06 04:00:08 -04:00
for _, alert := range Alerts {
2020-12-21 10:56:08 -05:00
// 跳过未启用
2021-11-06 04:00:08 -04:00
if !alert.Enabled() {
2020-12-21 10:56:08 -05:00
continue
}
for _, server := range ServerList {
2020-12-19 23:18:27 -05:00
// 监测点
2021-01-05 20:35:04 -05:00
alertsStore[alert.ID][server.ID] = append(alertsStore[alert.
2021-11-06 04:00:08 -04:00
ID][server.ID], alert.Snapshot(AlertsCycleTransferStatsStore[alert.ID], server, DB))
// 发送通知,分为触发报警和恢复通知
max, passed := alert.Check(alertsStore[alert.ID][server.ID])
if !passed {
alertsPrevState[alert.ID][server.ID] = _RuleCheckFail
2022-02-19 01:29:06 -05:00
message := fmt.Sprintf("[主机故障] %s(%s) 规则:%s", server.Name, IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
} else {
if alertsPrevState[alert.ID][server.ID] == _RuleCheckFail {
2022-02-19 01:29:06 -05:00
message := fmt.Sprintf("[主机恢复] %s(%s) 规则:%s", server.Name, IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
}
alertsPrevState[alert.ID][server.ID] = _RuleCheckPass
2020-12-19 23:18:27 -05:00
}
// 清理旧数据
if max > 0 && max < len(alertsStore[alert.ID][server.ID]) {
alertsStore[alert.ID][server.ID] = alertsStore[alert.ID][server.ID][len(alertsStore[alert.ID][server.ID])-max:]
2020-12-19 23:18:27 -05:00
}
}
}
}