添加代码注释+修正一个typo

This commit is contained in:
Akkia 2022-04-11 22:51:02 +08:00
parent 91a1e3fe22
commit 707985e5c8
No known key found for this signature in database
GPG Key ID: 464BA42A151C1E0F
11 changed files with 68 additions and 32 deletions

View File

@ -63,19 +63,20 @@ func initSystem() {
loadServers() //加载服务器列表
loadCrons() //加载计划任务
// 清理 服务请求记录 和 流量记录 的旧数据
// 每天的3:30 对 监控记录 和 流量记录 进行清理
_, err := singleton.Cron.AddFunc("0 30 3 * * *", cleanMonitorHistory)
if err != nil {
panic(err)
}
// 流量记录打点
// 每小时对流量记录进行打点
_, err = singleton.Cron.AddFunc("0 0 * * * *", recordTransferHourlyUsage)
if err != nil {
panic(err)
}
}
// recordTransferHourlyUsage 对流量记录进行打点
func recordTransferHourlyUsage() {
singleton.ServerLock.Lock()
defer singleton.ServerLock.Unlock()
@ -102,8 +103,9 @@ func recordTransferHourlyUsage() {
log.Println("NEZHA>> Cron 流量统计入库", len(txs), singleton.DB.Create(txs).Error)
}
// cleanMonitorHistory 清理无效或过时的 监控记录 和 流量记录
func cleanMonitorHistory() {
// 清理无效数据
// 清理已被删除的服务器的监控记录与流量记录
singleton.DB.Unscoped().Delete(&model.MonitorHistory{}, "created_at < ? OR monitor_id NOT IN (SELECT `id` FROM monitors)", time.Now().AddDate(0, 0, -30))
singleton.DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)")
// 计算可清理流量记录的时长
@ -146,6 +148,7 @@ func cleanMonitorHistory() {
}
}
// loadServers 加载服务器列表并根据ID排序
func loadServers() {
var servers []model.Server
singleton.DB.Find(&servers)
@ -159,6 +162,7 @@ func loadServers() {
singleton.ReSortServer()
}
// loadCrons 加载计划任务
func loadCrons() {
var crons []model.Cron
singleton.DB.Find(&crons)
@ -172,6 +176,7 @@ func loadCrons() {
crIgnoreMap[cr.Servers[j]] = true
}
// 注册计划任务
cr.CronJobID, err = singleton.Cron.AddFunc(cr.Scheduler, singleton.CronTrigger(cr))
if err == nil {
singleton.Crons[cr.ID] = &cr
@ -192,7 +197,7 @@ func loadCrons() {
func main() {
cleanMonitorHistory()
go rpc.ServeRPC(singleton.Conf.GRPCPort)
serviceSentinelDispatchBus := make(chan model.Monitor)
serviceSentinelDispatchBus := make(chan model.Monitor) // 用于传递服务监控任务信息的channel
go rpc.DispatchTask(serviceSentinelDispatchBus)
go rpc.DispatchKeepalive()
go singleton.AlertSentinelStart()

View File

@ -43,6 +43,7 @@ func (r *AlertRule) Enabled() bool {
return r.Enable != nil && *r.Enable
}
// Snapshot 对传入的Server进行该报警规则下所有type的检查 返回包含每项检查结果的空接口
func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server, db *gorm.DB) []interface{} {
var point []interface{}
for i := 0; i < len(r.Rules); i++ {
@ -51,9 +52,10 @@ func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Ser
return point
}
// Check 传入包含当前报警规则下所有type检查结果的空接口 返回报警持续时间与是否通过报警检查(通过则返回true)
func (r *AlertRule) Check(points [][]interface{}) (int, bool) {
var max int
var count int
var max int // 报警持续时间
var count int // 检查未通过的个数
for i := 0; i < len(r.Rules); i++ {
if r.Rules[i].IsTransferDurationRule() {
// 循环区间流量报警
@ -83,11 +85,13 @@ func (r *AlertRule) Check(points [][]interface{}) (int, bool) {
fail++
}
}
// 当70%以上的采样点未通过规则判断时 才认为当前检查未通过
if fail/total > 0.7 {
count++
break
}
}
}
// 仅当所有检查均未通过时 返回false
return max, count != len(r.Rules)
}

View File

@ -48,8 +48,9 @@ func (c *AgentConfig) Save() error {
return ioutil.WriteFile(c.v.ConfigFileUsed(), data, os.ModePerm)
}
// Config 站点配置
type Config struct {
Debug bool
Debug bool // debug模式开关
Site struct {
Brand string // 站点名称
CookieName string // 浏览器 Cookie 名称
@ -73,13 +74,14 @@ type Config struct {
EnablePlainIPInNotification bool
// IP变更提醒
Cover uint8 // 覆盖范围
IgnoredIPNotification string // 特定服务器
Cover uint8 // 覆盖范围0:提醒未被 IgnoredIPNotification 包含的所有服务器; 1:仅提醒被 IgnoredIPNotification 包含的服务器;
IgnoredIPNotification string // 特定服务器的 ID 列表，多个服务器 ID 用逗号分隔
v *viper.Viper
IgnoredIPNotificationServerIDs map[uint64]bool
IgnoredIPNotificationServerIDs map[uint64]bool // [ServerID] -> bool(值为true代表当前ServerID在特定服务器列表内)
}
// Read 读取配置文件并应用
func (c *Config) Read(path string) error {
c.v = viper.New()
c.v.SetConfigFile(path)
@ -101,6 +103,7 @@ func (c *Config) Read(path string) error {
return nil
}
// updateIgnoredIPNotificationID 更新用于判断服务器ID是否属于特定服务器的map
func (c *Config) updateIgnoredIPNotificationID() {
c.IgnoredIPNotificationServerIDs = make(map[uint64]bool)
splitedIDs := strings.Split(c.IgnoredIPNotification, ",")
@ -112,6 +115,7 @@ func (c *Config) updateIgnoredIPNotificationID() {
}
}
// Save 保存配置文件
func (c *Config) Save() error {
c.updateIgnoredIPNotificationID()
data, err := yaml.Marshal(c)

View File

@ -22,7 +22,7 @@ type Cron struct {
PushSuccessful bool // 推送成功的通知
LastExecutedAt time.Time // 最后一次执行时间
LastResult bool // 最后一次执行结果
Cover uint8
Cover uint8 // 计划任务覆盖范围 (0:仅覆盖特定服务器 1:仅忽略特定服务器)
CronJobID cron.EntryID `gorm:"-"`
ServersRaw string

View File

@ -56,6 +56,7 @@ func (m *Monitor) PB() *pb.Task {
}
}
// CronSpec 返回服务监控请求间隔对应的 cron 表达式
func (m *Monitor) CronSpec() string {
if m.Duration == 0 {
// 默认间隔 30 秒
@ -76,6 +77,7 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
return nil
}
// IsServiceSentinelNeeded reports whether a task of type t needs service
// monitoring. It returns false for the command, terminal and upgrade task
// types and true for every other value.
func IsServiceSentinelNeeded(t uint64) bool {
	// Task types that are excluded from service monitoring.
	excluded := t == TaskTypeCommand || t == TaskTypeTerminal || t == TaskTypeUpgrade
	return !excluded
}

View File

@ -4,6 +4,7 @@ import (
pb "github.com/naiba/nezha/proto"
)
// MonitorHistory 历史监控记录
type MonitorHistory struct {
Common
MonitorID uint64

View File

@ -159,10 +159,12 @@ func (u *Rule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server,
return nil
}
// IsTransferDurationRule reports whether the rule is a cycle-transfer rule,
// which is the case exactly when its Type ends with "_cycle".
func (rule Rule) IsTransferDurationRule() bool {
	const cycleSuffix = "_cycle"
	return strings.HasSuffix(rule.Type, cycleSuffix)
}
// GetTransferDurationStart 获取周期流量的起始时间
func (rule Rule) GetTransferDurationStart() time.Time {
// Accept uppercase and lowercase
unit := strings.ToLower(rule.CycleUnit)
@ -202,6 +204,7 @@ func (rule Rule) GetTransferDurationStart() time.Time {
return startTime
}
// GetTransferDurationEnd 获取周期流量结束时间
func (rule Rule) GetTransferDurationEnd() time.Time {
// Accept uppercase and lowercase
unit := strings.ToLower(rule.CycleUnit)

View File

@ -23,10 +23,11 @@ type NotificationHistory struct {
// 报警规则
var AlertsLock sync.RWMutex
var Alerts []*model.AlertRule
var alertsStore map[uint64]map[uint64][][]interface{}
var alertsPrevState map[uint64]map[uint64]uint
var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats
var alertsStore map[uint64]map[uint64][][]interface{} // [alert_id][server_id] -> 对应报警规则的检查结果
var alertsPrevState map[uint64]map[uint64]uint // [alert_id][server_id] -> 对应报警规则的上一次报警状态
var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats // [alert_id] -> 对应报警规则的周期流量统计
// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量报警统计信息
func addCycleTransferStatsInfo(alert *model.AlertRule) {
if !alert.Enabled() {
return
@ -52,6 +53,7 @@ func addCycleTransferStatsInfo(alert *model.AlertRule) {
}
}
// AlertSentinelStart 报警器启动
func AlertSentinelStart() {
alertsStore = make(map[uint64]map[uint64][][]interface{})
alertsPrevState = make(map[uint64]map[uint64]uint)
@ -120,6 +122,7 @@ func OnDeleteAlert(id uint64) {
delete(AlertsCycleTransferStatsStore, id)
}
// checkStatus 检查报警规则并发送报警
func checkStatus() {
AlertsLock.RLock()
defer AlertsLock.RUnlock()

View File

@ -16,6 +16,7 @@ const firstNotificationDelay = time.Minute * 15
var notifications []model.Notification
var notificationsLock sync.RWMutex
// LoadNotifications 加载通知方式到 singleton.notifications 变量
func LoadNotifications() {
notificationsLock.Lock()
if err := DB.Find(&notifications).Error; err != nil {

View File

@ -24,12 +24,14 @@ type ReportData struct {
Reporter uint64
}
// _TodayStatsOfMonitor 今日监控记录
type _TodayStatsOfMonitor struct {
Up int
Down int
Delay float32
Up int // 今日在线计数
Down int // 今日离线计数
Delay float32 // 今日平均延迟
}
// NewServiceSentinel 创建服务监控器
func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
ServiceSentinelShared = &ServiceSentinel{
serviceReportChannel: make(chan ReportData, 200),
@ -46,6 +48,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
monthlyStatus: make(map[uint64]*model.ServiceItemResponse),
dispatchBus: serviceSentinelDispatchBus,
}
// 加载历史记录
ServiceSentinelShared.loadMonitorHistory()
year, month, day := time.Now().Date()
@ -72,6 +75,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
ServiceSentinelShared.latestDate[k] = time.Now().Format("02-Jan-06")
}
// 启动服务监控器
go ServiceSentinelShared.worker()
// 每日将游标往后推一天
@ -88,20 +92,20 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
type ServiceSentinel struct {
serviceResponseDataStoreLock sync.RWMutex
monitorsLock sync.RWMutex
serviceReportChannel chan ReportData
serviceStatusToday map[uint64]*_TodayStatsOfMonitor
serviceCurrentStatusIndex map[uint64]int
serviceCurrentStatusData map[uint64][]model.MonitorHistory
latestDate map[uint64]string
serviceReportChannel chan ReportData // 服务状态汇报管道
serviceStatusToday map[uint64]*_TodayStatsOfMonitor // [monitor_id] -> _TodayStatsOfMonitor
serviceCurrentStatusIndex map[uint64]int // [monitor_id] -> 该监控ID对应的 serviceCurrentStatusData 的最新索引下标
serviceCurrentStatusData map[uint64][]model.MonitorHistory // [monitor_id] -> []model.MonitorHistory
latestDate map[uint64]string // 最近一次更新时间
lastStatus map[uint64]string
serviceResponseDataStoreCurrentUp map[uint64]uint64
serviceResponseDataStoreCurrentDown map[uint64]uint64
monitors map[uint64]*model.Monitor
serviceResponseDataStoreCurrentUp map[uint64]uint64 // [monitor_id] -> 当前服务在线计数
serviceResponseDataStoreCurrentDown map[uint64]uint64 // [monitor_id] -> 当前服务离线计数
monitors map[uint64]*model.Monitor // [monitor_id] -> model.Monitor
sslCertCache map[uint64]string
// 30天数据缓存
monthlyStatusLock sync.Mutex
monthlyStatus map[uint64]*model.ServiceItemResponse
// 服务监控调度计划任务
monthlyStatus map[uint64]*model.ServiceItemResponse // [monitor_id] -> model.ServiceItemResponse
// 服务监控任务调度管道
dispatchBus chan<- model.Monitor
}
@ -120,6 +124,7 @@ func (ss *ServiceSentinel) refreshMonthlyServiceStatus() {
}
}
// Dispatch hands the report r to the sentinel by sending it on
// ss.serviceReportChannel, the service status report channel.
// As a plain channel send, the call blocks until the channel can accept
// the value.
func (ss *ServiceSentinel) Dispatch(r ReportData) {
ss.serviceReportChannel <- r
}
@ -137,6 +142,7 @@ func (ss *ServiceSentinel) Monitors() []*model.Monitor {
return monitors
}
// loadMonitorHistory 加载服务监控器的历史状态信息
func (ss *ServiceSentinel) loadMonitorHistory() {
var monitors []*model.Monitor
DB.Find(&monitors)
@ -146,6 +152,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
ss.monitors = make(map[uint64]*model.Monitor)
for i := 0; i < len(monitors); i++ {
task := *monitors[i]
// 通过cron定时将服务监控任务传递给任务调度管道
monitors[i].CronJobID, err = Cron.AddFunc(task.CronSpec(), func() {
ss.dispatchBus <- task
})
@ -171,7 +178,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
}
}
// 加载历史记录
// 加载服务监控历史记录
var mhs []model.MonitorHistory
DB.Where("created_at >= ? AND created_at < ?", today.AddDate(0, 0, -29), today).Find(&mhs)
for i := 0; i < len(mhs); i++ {
@ -266,6 +273,7 @@ func (ss *ServiceSentinel) LoadStats() map[uint64]*model.ServiceItemResponse {
return ss.monthlyStatus
}
// getStateStr 根据服务在线率返回对应的状态字符串
func getStateStr(percent uint64) string {
if percent == 0 {
return "无数据"
@ -279,7 +287,9 @@ func getStateStr(percent uint64) string {
return "故障"
}
// worker 服务监控的实际工作流程
func (ss *ServiceSentinel) worker() {
// 从服务状态汇报管道获取汇报的服务数据
for r := range ss.serviceReportChannel {
if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 {
log.Printf("NEZAH>> 错误的服务监控上报 %+v", r)
@ -355,7 +365,7 @@ func (ss *ServiceSentinel) worker() {
// SSL 证书报警
var errMsg string
if strings.HasPrefix(mh.Data, "SSL证书错误") {
// 排除 i/o timeont、connection timeout、EOF 错误
// 排除 i/o timeout、connection timeout、EOF 错误
if !strings.HasSuffix(mh.Data, "timeout") &&
!strings.HasSuffix(mh.Data, "EOF") &&
!strings.HasSuffix(mh.Data, "timed out") {

View File

@ -23,14 +23,15 @@ var (
DB *gorm.DB
Loc *time.Location
ServerList map[uint64]*model.Server
SecretToID map[string]uint64
ServerList map[uint64]*model.Server // [ServerID] -> model.Server
SecretToID map[string]uint64 // [ServerSecret] -> ServerID
ServerLock sync.RWMutex
SortedServerList []*model.Server
SortedServerList []*model.Server // 用于存储服务器列表的 slice按照服务器 ID 排序
SortedServerLock sync.RWMutex
)
// Init 初始化时区为上海时区
func Init() {
var err error
Loc, err = time.LoadLocation("Asia/Shanghai")
@ -39,6 +40,7 @@ func Init() {
}
}
// ReSortServer 对服务器列表重新排序（DisplayIndex 相同时按服务器 ID 升序，ID 越小越靠前）
func ReSortServer() {
ServerLock.RLock()
defer ServerLock.RUnlock()
@ -50,6 +52,7 @@ func ReSortServer() {
SortedServerList = append(SortedServerList, s)
}
// 排序的具体实现：DisplayIndex 相同时按服务器 ID 升序排列（ID 越小越靠前）
sort.SliceStable(SortedServerList, func(i, j int) bool {
if SortedServerList[i].DisplayIndex == SortedServerList[j].DisplayIndex {
return SortedServerList[i].ID < SortedServerList[j].ID