diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index e302ec8..fce8654 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -63,19 +63,20 @@ func initSystem() { loadServers() //加载服务器列表 loadCrons() //加载计划任务 - // 清理 服务请求记录 和 流量记录 的旧数据 + // 每天的3:30 对 监控记录 和 流量记录 进行清理 _, err := singleton.Cron.AddFunc("0 30 3 * * *", cleanMonitorHistory) if err != nil { panic(err) } - // 流量记录打点 + // 每小时对流量记录进行打点 _, err = singleton.Cron.AddFunc("0 0 * * * *", recordTransferHourlyUsage) if err != nil { panic(err) } } +// recordTransferHourlyUsage 对流量记录进行打点 func recordTransferHourlyUsage() { singleton.ServerLock.Lock() defer singleton.ServerLock.Unlock() @@ -102,8 +103,9 @@ func recordTransferHourlyUsage() { log.Println("NEZHA>> Cron 流量统计入库", len(txs), singleton.DB.Create(txs).Error) } +// cleanMonitorHistory 清理无效或过时的 监控记录 和 流量记录 func cleanMonitorHistory() { - // 清理无效数据 + // 清理已被删除的服务器的监控记录与流量记录 singleton.DB.Unscoped().Delete(&model.MonitorHistory{}, "created_at < ? OR monitor_id NOT IN (SELECT `id` FROM monitors)", time.Now().AddDate(0, 0, -30)) singleton.DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)") // 计算可清理流量记录的时长 @@ -146,6 +148,7 @@ func cleanMonitorHistory() { } } +//loadServers 加载服务器列表并根据ID排序 func loadServers() { var servers []model.Server singleton.DB.Find(&servers) @@ -159,6 +162,7 @@ func loadServers() { singleton.ReSortServer() } +// loadCrons 加载计划任务 func loadCrons() { var crons []model.Cron singleton.DB.Find(&crons) @@ -172,6 +176,7 @@ func loadCrons() { crIgnoreMap[cr.Servers[j]] = true } + // 注册计划任务 cr.CronJobID, err = singleton.Cron.AddFunc(cr.Scheduler, singleton.CronTrigger(cr)) if err == nil { singleton.Crons[cr.ID] = &cr @@ -192,7 +197,7 @@ func loadCrons() { func main() { cleanMonitorHistory() go rpc.ServeRPC(singleton.Conf.GRPCPort) - serviceSentinelDispatchBus := make(chan model.Monitor) + serviceSentinelDispatchBus := make(chan model.Monitor) // 用于传递服务监控任务信息的channel go rpc.DispatchTask(serviceSentinelDispatchBus) go rpc.DispatchKeepalive() go singleton.AlertSentinelStart() diff --git a/model/alertrule.go b/model/alertrule.go index b0a6230..321480d 100644 --- a/model/alertrule.go +++ b/model/alertrule.go @@ -43,6 +43,7 @@ func (r *AlertRule) Enabled() bool { return r.Enable != nil && *r.Enable } +// Snapshot 对传入的Server进行该报警规则下所有type的检查 返回包含每项检查结果的空接口 func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server, db *gorm.DB) []interface{} { var point []interface{} for i := 0; i < len(r.Rules); i++ { @@ -51,9 +52,10 @@ func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Ser return point } +// Check 传入包含当前报警规则下所有type检查结果的空接口 返回报警持续时间与是否通过报警检查(通过则返回true) func (r *AlertRule) Check(points [][]interface{}) (int, bool) { - var max int - var count int + var max int // 报警持续时间 + var count int // 检查未通过的个数 for i := 0; i < len(r.Rules); i++ { if r.Rules[i].IsTransferDurationRule() { // 循环区间流量报警 @@ -83,11 +85,13 @@ func (r *AlertRule) Check(points [][]interface{}) (int, bool) { fail++ } } + // 当70%以上的采样点未通过规则判断时 才认为当前检查未通过 if fail/total > 0.7 { count++ break } } } + // 仅当所有检查均未通过时 返回false return max, count != len(r.Rules) } diff --git a/model/config.go b/model/config.go index 11927c2..92e04ba 100644 --- a/model/config.go +++ b/model/config.go @@ -48,8 +48,9 @@ func (c *AgentConfig) Save() error { return ioutil.WriteFile(c.v.ConfigFileUsed(), data, os.ModePerm) } +// Config 站点配置 type Config struct { - Debug bool + Debug bool // debug模式开关 Site struct { Brand string // 站点名称 CookieName string // 浏览器 Cookie 名称 @@ -73,13 +74,14 @@ type Config struct { EnablePlainIPInNotification bool // IP变更提醒 - Cover uint8 // 覆盖范围 - IgnoredIPNotification string // 特定服务器 + Cover uint8 // 覆盖范围(0:提醒未被 IgnoredIPNotification 包含的所有服务器; 1:仅提醒被 IgnoredIPNotification 包含的服务器;) + IgnoredIPNotification string // 特定服务器IP(多个服务器用逗号分隔) v *viper.Viper - IgnoredIPNotificationServerIDs map[uint64]bool + IgnoredIPNotificationServerIDs map[uint64]bool // [ServerID] -> bool(值为true代表当前ServerID在特定服务器列表内) } +// Read 读取配置文件并应用 func (c *Config) Read(path string) error { c.v = viper.New() c.v.SetConfigFile(path) @@ -101,6 +103,7 @@ func (c *Config) Read(path string) error { return nil } +// updateIgnoredIPNotificationID 更新用于判断服务器ID是否属于特定服务器的map func (c *Config) updateIgnoredIPNotificationID() { c.IgnoredIPNotificationServerIDs = make(map[uint64]bool) splitedIDs := strings.Split(c.IgnoredIPNotification, ",") @@ -112,6 +115,7 @@ func (c *Config) updateIgnoredIPNotificationID() { } } +// Save 保存配置文件 func (c *Config) Save() error { c.updateIgnoredIPNotificationID() data, err := yaml.Marshal(c) diff --git a/model/cron.go b/model/cron.go index f4394c2..fbdf75a 100644 --- a/model/cron.go +++ b/model/cron.go @@ -22,7 +22,7 @@ type Cron struct { PushSuccessful bool // 推送成功的通知 LastExecutedAt time.Time // 最后一次执行时间 LastResult bool // 最后一次执行结果 - Cover uint8 + Cover uint8 // 计划任务覆盖范围 (0:仅覆盖特定服务器 1:仅忽略特定服务器) CronJobID cron.EntryID `gorm:"-"` ServersRaw string diff --git a/model/monitor.go b/model/monitor.go index b10a5a9..cdabccd 100644 --- a/model/monitor.go +++ b/model/monitor.go @@ -56,6 +56,7 @@ func (m *Monitor) PB() *pb.Task { } } +// CronSpec 返回服务监控请求间隔对应的 cron 表达式 func (m *Monitor) CronSpec() string { if m.Duration == 0 { // 默认间隔 30 秒 @@ -76,6 +77,7 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error { return nil } +// IsServiceSentinelNeeded 判断该任务类型是否需要进行服务监控 需要则返回true func IsServiceSentinelNeeded(t uint64) bool { return t != TaskTypeCommand && t != TaskTypeTerminal && t != TaskTypeUpgrade } diff --git a/model/monitor_history.go b/model/monitor_history.go index 6e3f431..c7a5391 100644 --- a/model/monitor_history.go +++ b/model/monitor_history.go @@ -4,6 +4,7 @@ import ( pb "github.com/naiba/nezha/proto" ) +// MonitorHistory 历史监控记录 type MonitorHistory struct { Common MonitorID uint64 diff --git a/model/rule.go b/model/rule.go index bcec64c..8589665 100644 --- a/model/rule.go +++ b/model/rule.go @@ -159,10 +159,12 @@ func (u *Rule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server, return nil } +// IsTransferDurationRule 判断该规则是否属于周期流量规则 属于则返回true func (rule Rule) IsTransferDurationRule() bool { return strings.HasSuffix(rule.Type, "_cycle") } +// GetTransferDurationStart 获取周期流量的起始时间 func (rule Rule) GetTransferDurationStart() time.Time { // Accept uppercase and lowercase unit := strings.ToLower(rule.CycleUnit) @@ -202,6 +204,7 @@ func (rule Rule) GetTransferDurationStart() time.Time { return startTime } +// GetTransferDurationEnd 获取周期流量结束时间 func (rule Rule) GetTransferDurationEnd() time.Time { // Accept uppercase and lowercase unit := strings.ToLower(rule.CycleUnit) diff --git a/service/singleton/alertsentinel.go b/service/singleton/alertsentinel.go index 1d7d3d9..8902700 100644 --- a/service/singleton/alertsentinel.go +++ b/service/singleton/alertsentinel.go @@ -23,10 +23,11 @@ type NotificationHistory struct { // 报警规则 var AlertsLock sync.RWMutex var Alerts []*model.AlertRule -var alertsStore map[uint64]map[uint64][][]interface{} -var alertsPrevState map[uint64]map[uint64]uint -var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats +var alertsStore map[uint64]map[uint64][][]interface{} // [alert_id][server_id] -> 对应报警规则的检查结果 +var alertsPrevState map[uint64]map[uint64]uint // [alert_id][server_id] -> 对应报警规则的上一次报警状态 +var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats // [alert_id] -> 对应报警规则的周期流量统计 +// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量报警统计信息 func addCycleTransferStatsInfo(alert *model.AlertRule) { if !alert.Enabled() { return @@ -52,6 +53,7 @@ func addCycleTransferStatsInfo(alert *model.AlertRule) { } } +// AlertSentinelStart 报警器启动 func AlertSentinelStart() { alertsStore = make(map[uint64]map[uint64][][]interface{}) alertsPrevState = make(map[uint64]map[uint64]uint) @@ -120,6 +122,7 @@ func OnDeleteAlert(id uint64) { delete(AlertsCycleTransferStatsStore, id) } +// checkStatus 检查报警规则并发送报警 func checkStatus() { AlertsLock.RLock() defer AlertsLock.RUnlock() diff --git a/service/singleton/notification.go b/service/singleton/notification.go index d9f9d08..a7783d4 100644 --- a/service/singleton/notification.go +++ b/service/singleton/notification.go @@ -16,6 +16,7 @@ const firstNotificationDelay = time.Minute * 15 var notifications []model.Notification var notificationsLock sync.RWMutex +// LoadNotifications 加载通知方式到 singleton.notifications 变量 func LoadNotifications() { notificationsLock.Lock() if err := DB.Find(¬ifications).Error; err != nil { diff --git a/service/singleton/servicesentinel.go b/service/singleton/servicesentinel.go index f3465a0..50aa78f 100644 --- a/service/singleton/servicesentinel.go +++ b/service/singleton/servicesentinel.go @@ -24,12 +24,14 @@ type ReportData struct { Reporter uint64 } +// _TodayStatsOfMonitor 今日监控记录 type _TodayStatsOfMonitor struct { - Up int - Down int - Delay float32 + Up int // 今日在线计数 + Down int // 今日离线计数 + Delay float32 // 今日平均延迟 } +// NewServiceSentinel 创建服务监控器 func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) { ServiceSentinelShared = &ServiceSentinel{ serviceReportChannel: make(chan ReportData, 200), @@ -46,6 +48,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) { monthlyStatus: make(map[uint64]*model.ServiceItemResponse), dispatchBus: serviceSentinelDispatchBus, } + // 加载历史记录 ServiceSentinelShared.loadMonitorHistory() year, month, day := time.Now().Date() @@ -72,6 +75,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) { ServiceSentinelShared.latestDate[k] = time.Now().Format("02-Jan-06") } + // 启动服务监控器 go ServiceSentinelShared.worker() // 每日将游标往后推一天 @@ -88,20 +92,20 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) { type ServiceSentinel struct { serviceResponseDataStoreLock sync.RWMutex monitorsLock sync.RWMutex - serviceReportChannel chan ReportData - serviceStatusToday map[uint64]*_TodayStatsOfMonitor - serviceCurrentStatusIndex map[uint64]int - serviceCurrentStatusData map[uint64][]model.MonitorHistory - latestDate map[uint64]string + serviceReportChannel chan ReportData // 服务状态汇报管道 + serviceStatusToday map[uint64]*_TodayStatsOfMonitor // [monitor_id] -> _TodayStatsOfMonitor + serviceCurrentStatusIndex map[uint64]int // [monitor_id] -> 该监控ID对应的 serviceCurrentStatusData 的最新索引下标 + serviceCurrentStatusData map[uint64][]model.MonitorHistory // [monitor_id] -> []model.MonitorHistory + latestDate map[uint64]string // 最近一次更新时间 lastStatus map[uint64]string - serviceResponseDataStoreCurrentUp map[uint64]uint64 - serviceResponseDataStoreCurrentDown map[uint64]uint64 - monitors map[uint64]*model.Monitor + serviceResponseDataStoreCurrentUp map[uint64]uint64 // [monitor_id] -> 当前服务在线计数 + serviceResponseDataStoreCurrentDown map[uint64]uint64 // [monitor_id] -> 当前服务离线计数 + monitors map[uint64]*model.Monitor // [monitor_id] -> model.Monitor sslCertCache map[uint64]string // 30天数据缓存 monthlyStatusLock sync.Mutex - monthlyStatus map[uint64]*model.ServiceItemResponse - // 服务监控调度计划任务 + monthlyStatus map[uint64]*model.ServiceItemResponse // [monitor_id] -> model.ServiceItemResponse + // 服务监控任务调度管道 dispatchBus chan<- model.Monitor } @@ -120,6 +124,7 @@ func (ss *ServiceSentinel) refreshMonthlyServiceStatus() { } } +// Dispatch 将传入的 ReportData 传给 服务状态汇报管道 func (ss *ServiceSentinel) Dispatch(r ReportData) { ss.serviceReportChannel <- r } @@ -137,6 +142,7 @@ func (ss *ServiceSentinel) Monitors() []*model.Monitor { return monitors } +// LoadStats 加载服务监控器的历史状态信息 func (ss *ServiceSentinel) loadMonitorHistory() { var monitors []*model.Monitor DB.Find(&monitors) @@ -146,6 +152,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() { ss.monitors = make(map[uint64]*model.Monitor) for i := 0; i < len(monitors); i++ { task := *monitors[i] + // 通过cron定时将服务监控任务传递给任务调度管道 monitors[i].CronJobID, err = Cron.AddFunc(task.CronSpec(), func() { ss.dispatchBus <- task }) @@ -171,7 +178,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() { } } - // 加载历史记录 + // 加载服务监控历史记录 var mhs []model.MonitorHistory DB.Where("created_at >= ? AND created_at < ?", today.AddDate(0, 0, -29), today).Find(&mhs) for i := 0; i < len(mhs); i++ { @@ -266,6 +273,7 @@ func (ss *ServiceSentinel) LoadStats() map[uint64]*model.ServiceItemResponse { return ss.monthlyStatus } +// getStateStr 根据服务在线率返回对应的状态字符串 func getStateStr(percent uint64) string { if percent == 0 { return "无数据" @@ -279,7 +287,9 @@ func getStateStr(percent uint64) string { return "故障" } +// worker 服务监控的实际工作流程 func (ss *ServiceSentinel) worker() { + // 从服务状态汇报管道获取汇报的服务数据 for r := range ss.serviceReportChannel { if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 { log.Printf("NEZAH>> 错误的服务监控上报 %+v", r) @@ -355,7 +365,7 @@ func (ss *ServiceSentinel) worker() { // SSL 证书报警 var errMsg string if strings.HasPrefix(mh.Data, "SSL证书错误:") { - // 排除 i/o timeont、connection timeout、EOF 错误 + // 排除 i/o timeout、connection timeout、EOF 错误 if !strings.HasSuffix(mh.Data, "timeout") && !strings.HasSuffix(mh.Data, "EOF") && !strings.HasSuffix(mh.Data, "timed out") { diff --git a/service/singleton/singleton.go b/service/singleton/singleton.go index f82bf80..531ea5a 100644 --- a/service/singleton/singleton.go +++ b/service/singleton/singleton.go @@ -23,14 +23,15 @@ var ( DB *gorm.DB Loc *time.Location - ServerList map[uint64]*model.Server - SecretToID map[string]uint64 + ServerList map[uint64]*model.Server // [ServerID] -> model.Server + SecretToID map[string]uint64 // [ServerSecret] -> ServerID ServerLock sync.RWMutex - SortedServerList []*model.Server + SortedServerList []*model.Server // 用于存储服务器列表的 slice,按照服务器 ID 排序 SortedServerLock sync.RWMutex ) +// Init 初始化时区为上海时区 func Init() { var err error Loc, err = time.LoadLocation("Asia/Shanghai") @@ -39,6 +40,7 @@ func Init() { } } +// ReSortServer 根据服务器ID 对服务器列表进行排序(ID越大越靠前) func ReSortServer() { ServerLock.RLock() defer ServerLock.RUnlock() @@ -50,6 +52,7 @@ func ReSortServer() { SortedServerList = append(SortedServerList, s) } + // 按照服务器 ID 排序的具体实现(ID越大越靠前) sort.SliceStable(SortedServerList, func(i, j int) bool { if SortedServerList[i].DisplayIndex == SortedServerList[j].DisplayIndex { return SortedServerList[i].ID < SortedServerList[j].ID