mirror of
https://github.com/nezhahq/nezha.git
synced 2025-01-22 12:48:14 -05:00
添加代码注释+修正一个typo
This commit is contained in:
parent
91a1e3fe22
commit
707985e5c8
@ -63,19 +63,20 @@ func initSystem() {
|
||||
loadServers() //加载服务器列表
|
||||
loadCrons() //加载计划任务
|
||||
|
||||
// 清理 服务请求记录 和 流量记录 的旧数据
|
||||
// 每天的3:30 对 监控记录 和 流量记录 进行清理
|
||||
_, err := singleton.Cron.AddFunc("0 30 3 * * *", cleanMonitorHistory)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// 流量记录打点
|
||||
// 每小时对流量记录进行打点
|
||||
_, err = singleton.Cron.AddFunc("0 0 * * * *", recordTransferHourlyUsage)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordTransferHourlyUsage 对流量记录进行打点
|
||||
func recordTransferHourlyUsage() {
|
||||
singleton.ServerLock.Lock()
|
||||
defer singleton.ServerLock.Unlock()
|
||||
@ -102,8 +103,9 @@ func recordTransferHourlyUsage() {
|
||||
log.Println("NEZHA>> Cron 流量统计入库", len(txs), singleton.DB.Create(txs).Error)
|
||||
}
|
||||
|
||||
// cleanMonitorHistory 清理无效或过时的 监控记录 和 流量记录
|
||||
func cleanMonitorHistory() {
|
||||
// 清理无效数据
|
||||
// 清理已被删除的服务器的监控记录与流量记录
|
||||
singleton.DB.Unscoped().Delete(&model.MonitorHistory{}, "created_at < ? OR monitor_id NOT IN (SELECT `id` FROM monitors)", time.Now().AddDate(0, 0, -30))
|
||||
singleton.DB.Unscoped().Delete(&model.Transfer{}, "server_id NOT IN (SELECT `id` FROM servers)")
|
||||
// 计算可清理流量记录的时长
|
||||
@ -146,6 +148,7 @@ func cleanMonitorHistory() {
|
||||
}
|
||||
}
|
||||
|
||||
// loadServers loads all servers from the database and then re-sorts the server list via singleton.ReSortServer
|
||||
func loadServers() {
|
||||
var servers []model.Server
|
||||
singleton.DB.Find(&servers)
|
||||
@ -159,6 +162,7 @@ func loadServers() {
|
||||
singleton.ReSortServer()
|
||||
}
|
||||
|
||||
// loadCrons 加载计划任务
|
||||
func loadCrons() {
|
||||
var crons []model.Cron
|
||||
singleton.DB.Find(&crons)
|
||||
@ -172,6 +176,7 @@ func loadCrons() {
|
||||
crIgnoreMap[cr.Servers[j]] = true
|
||||
}
|
||||
|
||||
// 注册计划任务
|
||||
cr.CronJobID, err = singleton.Cron.AddFunc(cr.Scheduler, singleton.CronTrigger(cr))
|
||||
if err == nil {
|
||||
singleton.Crons[cr.ID] = &cr
|
||||
@ -192,7 +197,7 @@ func loadCrons() {
|
||||
func main() {
|
||||
cleanMonitorHistory()
|
||||
go rpc.ServeRPC(singleton.Conf.GRPCPort)
|
||||
serviceSentinelDispatchBus := make(chan model.Monitor)
|
||||
serviceSentinelDispatchBus := make(chan model.Monitor) // 用于传递服务监控任务信息的channel
|
||||
go rpc.DispatchTask(serviceSentinelDispatchBus)
|
||||
go rpc.DispatchKeepalive()
|
||||
go singleton.AlertSentinelStart()
|
||||
|
@ -43,6 +43,7 @@ func (r *AlertRule) Enabled() bool {
|
||||
return r.Enable != nil && *r.Enable
|
||||
}
|
||||
|
||||
// Snapshot 对传入的Server进行该报警规则下所有type的检查 返回包含每项检查结果的空接口
|
||||
func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server, db *gorm.DB) []interface{} {
|
||||
var point []interface{}
|
||||
for i := 0; i < len(r.Rules); i++ {
|
||||
@ -51,9 +52,10 @@ func (r *AlertRule) Snapshot(cycleTransferStats *CycleTransferStats, server *Ser
|
||||
return point
|
||||
}
|
||||
|
||||
// Check 传入包含当前报警规则下所有type检查结果的空接口 返回报警持续时间与是否通过报警检查(通过则返回true)
|
||||
func (r *AlertRule) Check(points [][]interface{}) (int, bool) {
|
||||
var max int
|
||||
var count int
|
||||
var max int // 报警持续时间
|
||||
var count int // 检查未通过的个数
|
||||
for i := 0; i < len(r.Rules); i++ {
|
||||
if r.Rules[i].IsTransferDurationRule() {
|
||||
// 循环区间流量报警
|
||||
@ -83,11 +85,13 @@ func (r *AlertRule) Check(points [][]interface{}) (int, bool) {
|
||||
fail++
|
||||
}
|
||||
}
|
||||
// 当70%以上的采样点未通过规则判断时 才认为当前检查未通过
|
||||
if fail/total > 0.7 {
|
||||
count++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
// 仅当所有检查均未通过时 返回false
|
||||
return max, count != len(r.Rules)
|
||||
}
|
||||
|
@ -48,8 +48,9 @@ func (c *AgentConfig) Save() error {
|
||||
return ioutil.WriteFile(c.v.ConfigFileUsed(), data, os.ModePerm)
|
||||
}
|
||||
|
||||
// Config 站点配置
|
||||
type Config struct {
|
||||
Debug bool
|
||||
Debug bool // debug模式开关
|
||||
Site struct {
|
||||
Brand string // 站点名称
|
||||
CookieName string // 浏览器 Cookie 名称
|
||||
@ -73,13 +74,14 @@ type Config struct {
|
||||
EnablePlainIPInNotification bool
|
||||
|
||||
// IP变更提醒
|
||||
Cover uint8 // 覆盖范围
|
||||
IgnoredIPNotification string // 特定服务器
|
||||
Cover uint8 // 覆盖范围(0:提醒未被 IgnoredIPNotification 包含的所有服务器; 1:仅提醒被 IgnoredIPNotification 包含的服务器;)
|
||||
IgnoredIPNotification string // 特定服务器IP(多个服务器用逗号分隔)
|
||||
|
||||
v *viper.Viper
|
||||
IgnoredIPNotificationServerIDs map[uint64]bool
|
||||
IgnoredIPNotificationServerIDs map[uint64]bool // [ServerID] -> bool(值为true代表当前ServerID在特定服务器列表内)
|
||||
}
|
||||
|
||||
// Read 读取配置文件并应用
|
||||
func (c *Config) Read(path string) error {
|
||||
c.v = viper.New()
|
||||
c.v.SetConfigFile(path)
|
||||
@ -101,6 +103,7 @@ func (c *Config) Read(path string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateIgnoredIPNotificationID 更新用于判断服务器ID是否属于特定服务器的map
|
||||
func (c *Config) updateIgnoredIPNotificationID() {
|
||||
c.IgnoredIPNotificationServerIDs = make(map[uint64]bool)
|
||||
splitedIDs := strings.Split(c.IgnoredIPNotification, ",")
|
||||
@ -112,6 +115,7 @@ func (c *Config) updateIgnoredIPNotificationID() {
|
||||
}
|
||||
}
|
||||
|
||||
// Save 保存配置文件
|
||||
func (c *Config) Save() error {
|
||||
c.updateIgnoredIPNotificationID()
|
||||
data, err := yaml.Marshal(c)
|
||||
|
@ -22,7 +22,7 @@ type Cron struct {
|
||||
PushSuccessful bool // 推送成功的通知
|
||||
LastExecutedAt time.Time // 最后一次执行时间
|
||||
LastResult bool // 最后一次执行结果
|
||||
Cover uint8
|
||||
Cover uint8 // 计划任务覆盖范围 (0:仅覆盖特定服务器 1:仅忽略特定服务器)
|
||||
|
||||
CronJobID cron.EntryID `gorm:"-"`
|
||||
ServersRaw string
|
||||
|
@ -56,6 +56,7 @@ func (m *Monitor) PB() *pb.Task {
|
||||
}
|
||||
}
|
||||
|
||||
// CronSpec 返回服务监控请求间隔对应的 cron 表达式
|
||||
func (m *Monitor) CronSpec() string {
|
||||
if m.Duration == 0 {
|
||||
// 默认间隔 30 秒
|
||||
@ -76,6 +77,7 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsServiceSentinelNeeded reports whether task type t needs service monitoring.
// It returns true for every task type except TaskTypeCommand, TaskTypeTerminal
// and TaskTypeUpgrade.
|
||||
func IsServiceSentinelNeeded(t uint64) bool {
|
||||
return t != TaskTypeCommand && t != TaskTypeTerminal && t != TaskTypeUpgrade
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import (
|
||||
pb "github.com/naiba/nezha/proto"
|
||||
)
|
||||
|
||||
// MonitorHistory 历史监控记录
|
||||
type MonitorHistory struct {
|
||||
Common
|
||||
MonitorID uint64
|
||||
|
@ -159,10 +159,12 @@ func (u *Rule) Snapshot(cycleTransferStats *CycleTransferStats, server *Server,
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsTransferDurationRule reports whether the rule is a cycle-transfer rule,
// i.e. whether its Type ends with the suffix "_cycle".
|
||||
func (rule Rule) IsTransferDurationRule() bool {
|
||||
return strings.HasSuffix(rule.Type, "_cycle")
|
||||
}
|
||||
|
||||
// GetTransferDurationStart 获取周期流量的起始时间
|
||||
func (rule Rule) GetTransferDurationStart() time.Time {
|
||||
// Accept uppercase and lowercase
|
||||
unit := strings.ToLower(rule.CycleUnit)
|
||||
@ -202,6 +204,7 @@ func (rule Rule) GetTransferDurationStart() time.Time {
|
||||
return startTime
|
||||
}
|
||||
|
||||
// GetTransferDurationEnd 获取周期流量结束时间
|
||||
func (rule Rule) GetTransferDurationEnd() time.Time {
|
||||
// Accept uppercase and lowercase
|
||||
unit := strings.ToLower(rule.CycleUnit)
|
||||
|
@ -23,10 +23,11 @@ type NotificationHistory struct {
|
||||
// 报警规则
|
||||
var AlertsLock sync.RWMutex
|
||||
var Alerts []*model.AlertRule
|
||||
var alertsStore map[uint64]map[uint64][][]interface{}
|
||||
var alertsPrevState map[uint64]map[uint64]uint
|
||||
var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats
|
||||
var alertsStore map[uint64]map[uint64][][]interface{} // [alert_id][server_id] -> 对应报警规则的检查结果
|
||||
var alertsPrevState map[uint64]map[uint64]uint // [alert_id][server_id] -> 对应报警规则的上一次报警状态
|
||||
var AlertsCycleTransferStatsStore map[uint64]*model.CycleTransferStats // [alert_id] -> 对应报警规则的周期流量统计
|
||||
|
||||
// addCycleTransferStatsInfo 向AlertsCycleTransferStatsStore中添加周期流量报警统计信息
|
||||
func addCycleTransferStatsInfo(alert *model.AlertRule) {
|
||||
if !alert.Enabled() {
|
||||
return
|
||||
@ -52,6 +53,7 @@ func addCycleTransferStatsInfo(alert *model.AlertRule) {
|
||||
}
|
||||
}
|
||||
|
||||
// AlertSentinelStart 报警器启动
|
||||
func AlertSentinelStart() {
|
||||
alertsStore = make(map[uint64]map[uint64][][]interface{})
|
||||
alertsPrevState = make(map[uint64]map[uint64]uint)
|
||||
@ -120,6 +122,7 @@ func OnDeleteAlert(id uint64) {
|
||||
delete(AlertsCycleTransferStatsStore, id)
|
||||
}
|
||||
|
||||
// checkStatus 检查报警规则并发送报警
|
||||
func checkStatus() {
|
||||
AlertsLock.RLock()
|
||||
defer AlertsLock.RUnlock()
|
||||
|
@ -16,6 +16,7 @@ const firstNotificationDelay = time.Minute * 15
|
||||
var notifications []model.Notification
|
||||
var notificationsLock sync.RWMutex
|
||||
|
||||
// LoadNotifications 加载通知方式到 singleton.notifications 变量
|
||||
func LoadNotifications() {
|
||||
notificationsLock.Lock()
|
||||
if err := DB.Find(¬ifications).Error; err != nil {
|
||||
|
@ -24,12 +24,14 @@ type ReportData struct {
|
||||
Reporter uint64
|
||||
}
|
||||
|
||||
// _TodayStatsOfMonitor 今日监控记录
|
||||
type _TodayStatsOfMonitor struct {
|
||||
Up int
|
||||
Down int
|
||||
Delay float32
|
||||
Up int // 今日在线计数
|
||||
Down int // 今日离线计数
|
||||
Delay float32 // 今日平均延迟
|
||||
}
|
||||
|
||||
// NewServiceSentinel 创建服务监控器
|
||||
func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
|
||||
ServiceSentinelShared = &ServiceSentinel{
|
||||
serviceReportChannel: make(chan ReportData, 200),
|
||||
@ -46,6 +48,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
|
||||
monthlyStatus: make(map[uint64]*model.ServiceItemResponse),
|
||||
dispatchBus: serviceSentinelDispatchBus,
|
||||
}
|
||||
// 加载历史记录
|
||||
ServiceSentinelShared.loadMonitorHistory()
|
||||
|
||||
year, month, day := time.Now().Date()
|
||||
@ -72,6 +75,7 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
|
||||
ServiceSentinelShared.latestDate[k] = time.Now().Format("02-Jan-06")
|
||||
}
|
||||
|
||||
// 启动服务监控器
|
||||
go ServiceSentinelShared.worker()
|
||||
|
||||
// 每日将游标往后推一天
|
||||
@ -88,20 +92,20 @@ func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
|
||||
type ServiceSentinel struct {
|
||||
serviceResponseDataStoreLock sync.RWMutex
|
||||
monitorsLock sync.RWMutex
|
||||
serviceReportChannel chan ReportData
|
||||
serviceStatusToday map[uint64]*_TodayStatsOfMonitor
|
||||
serviceCurrentStatusIndex map[uint64]int
|
||||
serviceCurrentStatusData map[uint64][]model.MonitorHistory
|
||||
latestDate map[uint64]string
|
||||
serviceReportChannel chan ReportData // 服务状态汇报管道
|
||||
serviceStatusToday map[uint64]*_TodayStatsOfMonitor // [monitor_id] -> _TodayStatsOfMonitor
|
||||
serviceCurrentStatusIndex map[uint64]int // [monitor_id] -> 该监控ID对应的 serviceCurrentStatusData 的最新索引下标
|
||||
serviceCurrentStatusData map[uint64][]model.MonitorHistory // [monitor_id] -> []model.MonitorHistory
|
||||
latestDate map[uint64]string // 最近一次更新时间
|
||||
lastStatus map[uint64]string
|
||||
serviceResponseDataStoreCurrentUp map[uint64]uint64
|
||||
serviceResponseDataStoreCurrentDown map[uint64]uint64
|
||||
monitors map[uint64]*model.Monitor
|
||||
serviceResponseDataStoreCurrentUp map[uint64]uint64 // [monitor_id] -> 当前服务在线计数
|
||||
serviceResponseDataStoreCurrentDown map[uint64]uint64 // [monitor_id] -> 当前服务离线计数
|
||||
monitors map[uint64]*model.Monitor // [monitor_id] -> model.Monitor
|
||||
sslCertCache map[uint64]string
|
||||
// 30天数据缓存
|
||||
monthlyStatusLock sync.Mutex
|
||||
monthlyStatus map[uint64]*model.ServiceItemResponse
|
||||
// 服务监控调度计划任务
|
||||
monthlyStatus map[uint64]*model.ServiceItemResponse // [monitor_id] -> model.ServiceItemResponse
|
||||
// 服务监控任务调度管道
|
||||
dispatchBus chan<- model.Monitor
|
||||
}
|
||||
|
||||
@ -120,6 +124,7 @@ func (ss *ServiceSentinel) refreshMonthlyServiceStatus() {
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatch sends the report r to the sentinel's service report channel
// (serviceReportChannel). The send blocks while the channel cannot accept
// another value.
|
||||
func (ss *ServiceSentinel) Dispatch(r ReportData) {
|
||||
ss.serviceReportChannel <- r
|
||||
}
|
||||
@ -137,6 +142,7 @@ func (ss *ServiceSentinel) Monitors() []*model.Monitor {
|
||||
return monitors
|
||||
}
|
||||
|
||||
// loadMonitorHistory loads the monitors and the service monitoring history records from the database
|
||||
func (ss *ServiceSentinel) loadMonitorHistory() {
|
||||
var monitors []*model.Monitor
|
||||
DB.Find(&monitors)
|
||||
@ -146,6 +152,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
|
||||
ss.monitors = make(map[uint64]*model.Monitor)
|
||||
for i := 0; i < len(monitors); i++ {
|
||||
task := *monitors[i]
|
||||
// 通过cron定时将服务监控任务传递给任务调度管道
|
||||
monitors[i].CronJobID, err = Cron.AddFunc(task.CronSpec(), func() {
|
||||
ss.dispatchBus <- task
|
||||
})
|
||||
@ -171,7 +178,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
|
||||
}
|
||||
}
|
||||
|
||||
// 加载历史记录
|
||||
// 加载服务监控历史记录
|
||||
var mhs []model.MonitorHistory
|
||||
DB.Where("created_at >= ? AND created_at < ?", today.AddDate(0, 0, -29), today).Find(&mhs)
|
||||
for i := 0; i < len(mhs); i++ {
|
||||
@ -266,6 +273,7 @@ func (ss *ServiceSentinel) LoadStats() map[uint64]*model.ServiceItemResponse {
|
||||
return ss.monthlyStatus
|
||||
}
|
||||
|
||||
// getStateStr 根据服务在线率返回对应的状态字符串
|
||||
func getStateStr(percent uint64) string {
|
||||
if percent == 0 {
|
||||
return "无数据"
|
||||
@ -279,7 +287,9 @@ func getStateStr(percent uint64) string {
|
||||
return "故障"
|
||||
}
|
||||
|
||||
// worker 服务监控的实际工作流程
|
||||
func (ss *ServiceSentinel) worker() {
|
||||
// 从服务状态汇报管道获取汇报的服务数据
|
||||
for r := range ss.serviceReportChannel {
|
||||
if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 {
|
||||
log.Printf("NEZAH>> 错误的服务监控上报 %+v", r)
|
||||
@ -355,7 +365,7 @@ func (ss *ServiceSentinel) worker() {
|
||||
// SSL 证书报警
|
||||
var errMsg string
|
||||
if strings.HasPrefix(mh.Data, "SSL证书错误:") {
|
||||
// 排除 i/o timeont、connection timeout、EOF 错误
|
||||
// 排除 i/o timeout、connection timeout、EOF 错误
|
||||
if !strings.HasSuffix(mh.Data, "timeout") &&
|
||||
!strings.HasSuffix(mh.Data, "EOF") &&
|
||||
!strings.HasSuffix(mh.Data, "timed out") {
|
||||
|
@ -23,14 +23,15 @@ var (
|
||||
DB *gorm.DB
|
||||
Loc *time.Location
|
||||
|
||||
ServerList map[uint64]*model.Server
|
||||
SecretToID map[string]uint64
|
||||
ServerList map[uint64]*model.Server // [ServerID] -> model.Server
|
||||
SecretToID map[string]uint64 // [ServerSecret] -> ServerID
|
||||
ServerLock sync.RWMutex
|
||||
|
||||
SortedServerList []*model.Server
|
||||
SortedServerList []*model.Server // 用于存储服务器列表的 slice,按照服务器 ID 排序
|
||||
SortedServerLock sync.RWMutex
|
||||
)
|
||||
|
||||
// Init 初始化时区为上海时区
|
||||
func Init() {
|
||||
var err error
|
||||
Loc, err = time.LoadLocation("Asia/Shanghai")
|
||||
@ -39,6 +40,7 @@ func Init() {
|
||||
}
|
||||
}
|
||||
|
||||
// ReSortServer re-sorts SortedServerList; among servers with the same DisplayIndex, the smaller ID comes first
|
||||
func ReSortServer() {
|
||||
ServerLock.RLock()
|
||||
defer ServerLock.RUnlock()
|
||||
@ -50,6 +52,7 @@ func ReSortServer() {
|
||||
SortedServerList = append(SortedServerList, s)
|
||||
}
|
||||
|
||||
// Sort implementation: ties on DisplayIndex are broken by ascending ID (smaller ID first)
|
||||
sort.SliceStable(SortedServerList, func(i, j int) bool {
|
||||
if SortedServerList[i].DisplayIndex == SortedServerList[j].DisplayIndex {
|
||||
return SortedServerList[i].ID < SortedServerList[j].ID
|
||||
|
Loading…
Reference in New Issue
Block a user