2022-01-08 22:54:14 -05:00
|
|
|
package singleton
|
2020-12-19 23:18:27 -05:00
|
|
|
|
|
|
|
import (
|
2024-10-23 09:55:12 -04:00
|
|
|
"fmt"
|
2021-01-05 20:35:04 -05:00
|
|
|
"log"
|
2020-12-19 23:18:27 -05:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2022-04-30 07:23:19 -04:00
|
|
|
"github.com/jinzhu/copier"
|
|
|
|
|
2020-12-19 23:18:27 -05:00
|
|
|
"github.com/naiba/nezha/model"
|
|
|
|
)
|
|
|
|
|
2021-06-21 09:30:42 -04:00
|
|
|
// Previous-check states stored per (alert rule, server) in alertsPrevState.
const (
	// _RuleCheckNoData is the zero value: no state has been recorded yet.
	_RuleCheckNoData = iota
	// _RuleCheckFail marks that the last recorded check did not pass.
	_RuleCheckFail
	// _RuleCheckPass marks that the last recorded check passed.
	_RuleCheckPass
)
|
|
|
|
|
2021-11-06 04:00:08 -04:00
|
|
|
// NotificationHistory pairs a time span with a point in time.
//
// NOTE(review): nothing in this file references the type; presumably it
// records how long a notification stays muted and when that mute ends —
// confirm against its users before relying on this.
type NotificationHistory struct {
	// Duration is a length of time.
	Duration time.Duration
	// Until is an absolute point in time.
	Until time.Time
}
|
|
|
|
|
2020-12-19 23:18:27 -05:00
|
|
|
// 报警规则
|
2022-04-14 15:13:53 -04:00
|
|
|
var (
|
|
|
|
AlertsLock sync.RWMutex
|
|
|
|
Alerts []*model.AlertRule
|
2024-10-26 11:57:47 -04:00
|
|
|
alertsStore map[uint64]map[uint64][][]bool // [alert_id][server_id] -> 对应报警规则的检查结果
|
|
|
|
alertsPrevState map[uint64]map[uint64]uint8 // [alert_id][server_id] -> 对应报警规则的上一次报警状态
|
|
|
|
AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats // [alert_id] -> 对应报警规则的周期流量统计
|
2022-04-14 15:13:53 -04:00
|
|
|
)
|
2020-12-19 23:18:27 -05:00
|
|
|
|
2022-04-11 10:51:02 -04:00
|
|
|
// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量报警统计信息
|
2021-11-06 04:00:08 -04:00
|
|
|
func addCycleTransferStatsInfo(alert *model.AlertRule) {
|
|
|
|
if !alert.Enabled() {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
for j := 0; j < len(alert.Rules); j++ {
|
|
|
|
if !alert.Rules[j].IsTransferDurationRule() {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if AlertsCycleTransferStatsStore[alert.ID] == nil {
|
|
|
|
from := alert.Rules[j].GetTransferDurationStart()
|
2022-01-11 05:15:43 -05:00
|
|
|
to := alert.Rules[j].GetTransferDurationEnd()
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsCycleTransferStatsStore[alert.ID] = &model.CycleTransferStats{
|
|
|
|
Name: alert.Name,
|
|
|
|
From: from,
|
2022-01-11 05:15:43 -05:00
|
|
|
To: to,
|
2021-11-10 20:40:10 -05:00
|
|
|
Max: uint64(alert.Rules[j].Max),
|
|
|
|
Min: uint64(alert.Rules[j].Min),
|
2021-11-06 04:00:08 -04:00
|
|
|
ServerName: make(map[uint64]string),
|
|
|
|
Transfer: make(map[uint64]uint64),
|
|
|
|
NextUpdate: make(map[uint64]time.Time),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
|
2022-04-11 10:51:02 -04:00
|
|
|
// AlertSentinelStart 报警器启动
|
2021-01-23 20:41:35 -05:00
|
|
|
func AlertSentinelStart() {
|
2024-10-26 11:57:47 -04:00
|
|
|
alertsStore = make(map[uint64]map[uint64][][]bool)
|
2024-10-25 20:16:57 -04:00
|
|
|
alertsPrevState = make(map[uint64]map[uint64]uint8)
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsCycleTransferStatsStore = make(map[uint64]*model.CycleTransferStats)
|
|
|
|
AlertsLock.Lock()
|
|
|
|
if err := DB.Find(&Alerts).Error; err != nil {
|
2020-12-19 23:18:27 -05:00
|
|
|
panic(err)
|
|
|
|
}
|
2022-04-14 15:13:53 -04:00
|
|
|
for _, alert := range Alerts {
|
2024-10-26 11:57:47 -04:00
|
|
|
alertsStore[alert.ID] = make(map[uint64][][]bool)
|
2024-10-25 20:16:57 -04:00
|
|
|
alertsPrevState[alert.ID] = make(map[uint64]uint8)
|
2022-04-14 15:13:53 -04:00
|
|
|
addCycleTransferStatsInfo(alert)
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsLock.Unlock()
|
2020-12-19 23:18:27 -05:00
|
|
|
|
|
|
|
time.Sleep(time.Second * 10)
|
2021-01-05 20:35:04 -05:00
|
|
|
var lastPrint time.Time
|
|
|
|
var checkCount uint64
|
|
|
|
for {
|
|
|
|
startedAt := time.Now()
|
|
|
|
checkStatus()
|
|
|
|
checkCount++
|
|
|
|
if lastPrint.Before(startedAt.Add(-1 * time.Hour)) {
|
2021-04-20 07:30:34 -04:00
|
|
|
if Conf.Debug {
|
2021-09-27 09:18:09 -04:00
|
|
|
log.Println("NEZHA>> 报警规则检测每小时", checkCount, "次", startedAt, time.Now())
|
2021-04-20 07:30:34 -04:00
|
|
|
}
|
2021-01-05 20:35:04 -05:00
|
|
|
checkCount = 0
|
|
|
|
lastPrint = startedAt
|
|
|
|
}
|
2021-06-30 06:15:53 -04:00
|
|
|
time.Sleep(time.Until(startedAt.Add(time.Second * 3))) // 3秒钟检查一次
|
2021-01-05 20:35:04 -05:00
|
|
|
}
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
|
2024-10-26 11:57:47 -04:00
|
|
|
func OnRefreshOrAddAlert(alert *model.AlertRule) {
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsLock.Lock()
|
|
|
|
defer AlertsLock.Unlock()
|
2020-12-19 23:18:27 -05:00
|
|
|
delete(alertsStore, alert.ID)
|
2021-06-21 09:30:42 -04:00
|
|
|
delete(alertsPrevState, alert.ID)
|
2020-12-21 03:34:21 -05:00
|
|
|
var isEdit bool
|
2021-11-06 04:00:08 -04:00
|
|
|
for i := 0; i < len(Alerts); i++ {
|
|
|
|
if Alerts[i].ID == alert.ID {
|
2024-10-26 11:57:47 -04:00
|
|
|
Alerts[i] = alert
|
2020-12-21 03:34:21 -05:00
|
|
|
isEdit = true
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
}
|
2020-12-21 03:34:21 -05:00
|
|
|
if !isEdit {
|
2024-10-26 11:57:47 -04:00
|
|
|
Alerts = append(Alerts, alert)
|
2020-12-21 03:34:21 -05:00
|
|
|
}
|
2024-10-26 11:57:47 -04:00
|
|
|
alertsStore[alert.ID] = make(map[uint64][][]bool)
|
2024-10-25 20:16:57 -04:00
|
|
|
alertsPrevState[alert.ID] = make(map[uint64]uint8)
|
2021-11-06 04:00:08 -04:00
|
|
|
delete(AlertsCycleTransferStatsStore, alert.ID)
|
2024-10-26 11:57:47 -04:00
|
|
|
addCycleTransferStatsInfo(alert)
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
|
2024-10-25 20:16:57 -04:00
|
|
|
func OnDeleteAlert(id []uint64) {
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsLock.Lock()
|
|
|
|
defer AlertsLock.Unlock()
|
2024-10-25 20:16:57 -04:00
|
|
|
for _, i := range id {
|
|
|
|
delete(alertsStore, i)
|
|
|
|
delete(alertsPrevState, i)
|
|
|
|
currentAlerts := Alerts[:0]
|
|
|
|
for _, alert := range Alerts {
|
|
|
|
if alert.ID != i {
|
|
|
|
currentAlerts = append(currentAlerts, alert)
|
|
|
|
}
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
2024-10-25 20:16:57 -04:00
|
|
|
Alerts = currentAlerts
|
|
|
|
delete(AlertsCycleTransferStatsStore, i)
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-04-11 10:51:02 -04:00
|
|
|
// checkStatus 检查报警规则并发送报警
|
2020-12-19 23:18:27 -05:00
|
|
|
func checkStatus() {
|
2021-11-06 04:00:08 -04:00
|
|
|
AlertsLock.RLock()
|
|
|
|
defer AlertsLock.RUnlock()
|
2021-01-23 20:41:35 -05:00
|
|
|
ServerLock.RLock()
|
|
|
|
defer ServerLock.RUnlock()
|
2020-12-19 23:18:27 -05:00
|
|
|
|
2021-11-06 04:00:08 -04:00
|
|
|
for _, alert := range Alerts {
|
2020-12-21 10:56:08 -05:00
|
|
|
// 跳过未启用
|
2021-11-06 04:00:08 -04:00
|
|
|
if !alert.Enabled() {
|
2020-12-21 10:56:08 -05:00
|
|
|
continue
|
|
|
|
}
|
2021-01-23 20:41:35 -05:00
|
|
|
for _, server := range ServerList {
|
2020-12-19 23:18:27 -05:00
|
|
|
// 监测点
|
2021-01-05 20:35:04 -05:00
|
|
|
alertsStore[alert.ID][server.ID] = append(alertsStore[alert.
|
2021-11-06 04:00:08 -04:00
|
|
|
ID][server.ID], alert.Snapshot(AlertsCycleTransferStatsStore[alert.ID], server, DB))
|
2021-06-21 09:30:42 -04:00
|
|
|
// 发送通知,分为触发报警和恢复通知
|
|
|
|
max, passed := alert.Check(alertsStore[alert.ID][server.ID])
|
2022-04-18 07:59:42 -04:00
|
|
|
// 保存当前服务器状态信息
|
|
|
|
curServer := model.Server{}
|
|
|
|
copier.Copy(&curServer, server)
|
2022-09-12 16:01:08 -04:00
|
|
|
|
|
|
|
// 本次未通过检查
|
2021-06-21 09:30:42 -04:00
|
|
|
if !passed {
|
2022-09-12 16:01:08 -04:00
|
|
|
// 始终触发模式或上次检查不为失败时触发报警(跳过单次触发+上次失败的情况)
|
|
|
|
if alert.TriggerMode == model.ModeAlwaysTrigger || alertsPrevState[alert.ID][server.ID] != _RuleCheckFail {
|
|
|
|
alertsPrevState[alert.ID][server.ID] = _RuleCheckFail
|
2024-10-31 17:07:04 -04:00
|
|
|
message := fmt.Sprintf("[%s] %s(%s) %s", Localizer.T("Incident"),
|
2024-11-22 09:40:43 -05:00
|
|
|
server.Name, IPDesensitize(server.GeoIP.IP.Join()), alert.Name)
|
2022-09-13 23:14:23 -04:00
|
|
|
go SendTriggerTasks(alert.FailTriggerTasks, curServer.ID)
|
2024-10-23 09:55:12 -04:00
|
|
|
go SendNotification(alert.NotificationGroupID, message, NotificationMuteLabel.ServerIncident(server.ID, alert.ID), &curServer)
|
2023-04-15 07:04:04 -04:00
|
|
|
// 清除恢复通知的静音缓存
|
2024-10-23 09:55:12 -04:00
|
|
|
UnMuteNotification(alert.NotificationGroupID, NotificationMuteLabel.ServerIncidentResolved(server.ID, alert.ID))
|
2022-09-12 16:01:08 -04:00
|
|
|
}
|
2021-06-21 09:30:42 -04:00
|
|
|
} else {
|
2022-09-12 16:01:08 -04:00
|
|
|
// 本次通过检查但上一次的状态为失败,则发送恢复通知
|
2021-06-21 09:30:42 -04:00
|
|
|
if alertsPrevState[alert.ID][server.ID] == _RuleCheckFail {
|
2024-10-31 17:07:04 -04:00
|
|
|
message := fmt.Sprintf("[%s] %s(%s) %s", Localizer.T("Resolved"),
|
2024-11-22 09:40:43 -05:00
|
|
|
server.Name, IPDesensitize(server.GeoIP.IP.Join()), alert.Name)
|
2022-09-13 23:14:23 -04:00
|
|
|
go SendTriggerTasks(alert.RecoverTriggerTasks, curServer.ID)
|
2024-10-23 09:55:12 -04:00
|
|
|
go SendNotification(alert.NotificationGroupID, message, NotificationMuteLabel.ServerIncidentResolved(server.ID, alert.ID), &curServer)
|
2023-04-15 07:04:04 -04:00
|
|
|
// 清除失败通知的静音缓存
|
2024-10-23 09:55:12 -04:00
|
|
|
UnMuteNotification(alert.NotificationGroupID, NotificationMuteLabel.ServerIncident(server.ID, alert.ID))
|
2021-06-21 09:30:42 -04:00
|
|
|
}
|
|
|
|
alertsPrevState[alert.ID][server.ID] = _RuleCheckPass
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
// 清理旧数据
|
2021-01-06 08:20:02 -05:00
|
|
|
if max > 0 && max < len(alertsStore[alert.ID][server.ID]) {
|
|
|
|
alertsStore[alert.ID][server.ID] = alertsStore[alert.ID][server.ID][len(alertsStore[alert.ID][server.ID])-max:]
|
2020-12-19 23:18:27 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|