dashboard: 服务监控请求时间间隔

This commit is contained in:
naiba 2021-09-02 23:45:21 +08:00
parent 0ea21598e8
commit 446ab3b1b8
12 changed files with 119 additions and 50 deletions

View File

@ -4,7 +4,7 @@
<br>
<small><i>LOGO designed by <a href="https://xio.ng" target="_blank">熊大</a> .</i></small>
<br><br>
<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Dashboard%20image?label=Dash%20v0.9.32&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/github/v/release/naiba/nezha?color=brightgreen&label=Agent&style=for-the-badge&logo=github">&nbsp;<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Agent%20release?label=Agent%20CI&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/badge/Installer-v0.7.0-brightgreen?style=for-the-badge&logo=linux">
<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Dashboard%20image?label=Dash%20v0.9.33&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/github/v/release/naiba/nezha?color=brightgreen&label=Agent&style=for-the-badge&logo=github">&nbsp;<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Agent%20release?label=Agent%20CI&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/badge/Installer-v0.7.0-brightgreen?style=for-the-badge&logo=linux">
<br>
<br>
<p>:trollface: <b>哪吒监控</b> 一站式轻监控轻运维系统。支持系统状态、HTTP(SSL 证书变更、即将到期、到期)、TCP、Ping 监控报警,命令批量执行和计划任务。</p>

View File

@ -204,6 +204,7 @@ type monitorForm struct {
Cover uint8
Notify string
SkipServersRaw string
Duration uint64
}
func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
@ -218,6 +219,7 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
m.SkipServersRaw = mf.SkipServersRaw
m.Cover = mf.Cover
m.Notify = mf.Notify == "on"
m.Duration = mf.Duration
}
if err == nil {
if m.ID == 0 {
@ -226,14 +228,15 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
err = dao.DB.Save(&m).Error
}
}
if err == nil {
err = dao.ServiceSentinelShared.OnMonitorUpdate(m)
}
if err != nil {
c.JSON(http.StatusOK, model.Response{
Code: http.StatusBadRequest,
Message: fmt.Sprintf("请求错误:%s", err),
})
return
} else {
dao.ServiceSentinelShared.OnMonitorUpdate()
}
c.JSON(http.StatusOK, model.Response{
Code: http.StatusOK,

View File

@ -17,7 +17,11 @@ import (
"github.com/naiba/nezha/service/dao"
)
var serviceSentinelDispatchBus chan model.Monitor
func init() {
serviceSentinelDispatchBus = make(chan model.Monitor)
shanghai, err := time.LoadLocation("Asia/Shanghai")
if err != nil {
panic(err)
@ -55,7 +59,7 @@ func initSystem() {
dao.DB.AutoMigrate(model.Server{}, model.User{},
model.Notification{}, model.AlertRule{}, model.Monitor{},
model.MonitorHistory{}, model.Cron{}, model.Transfer{})
dao.NewServiceSentinel()
dao.NewServiceSentinel(serviceSentinelDispatchBus)
loadServers() //加载服务器列表
loadCrons() //加载计划任务
@ -65,6 +69,7 @@ func initSystem() {
if err != nil {
panic(err)
}
// 流量记录打点
_, err = dao.Cron.AddFunc("0 * * * *", recordTransferHourlyUsage)
if err != nil {
@ -173,7 +178,7 @@ func loadCrons() {
func main() {
cleanMonitorHistory()
go rpc.ServeRPC(dao.Conf.GRPCPort)
go rpc.DispatchTask(time.Second * 30)
go rpc.DispatchTask(serviceSentinelDispatchBus)
go dao.AlertSentinelStart()
srv := controller.ServeWeb(dao.Conf.HTTPPort)
graceful.Graceful(func() error {

View File

@ -3,7 +3,6 @@ package rpc
import (
"fmt"
"net"
"time"
"google.golang.org/grpc"
@ -25,41 +24,36 @@ func ServeRPC(port uint) {
server.Serve(listen)
}
func DispatchTask(duration time.Duration) {
var index uint64 = 0
for {
var hasAliveAgent bool
tasks := dao.ServiceSentinelShared.Monitors()
func DispatchTask(serviceSentinelDispatchBus <-chan model.Monitor) {
workedServerIndex := 0
for task := range serviceSentinelDispatchBus {
round := 0
prevIndex := workedServerIndex
dao.SortedServerLock.RLock()
startedAt := time.Now()
for i := 0; i < len(tasks); i++ {
if index >= uint64(len(dao.SortedServerList)) {
index = 0
if !hasAliveAgent {
break
}
hasAliveAgent = false
}
// 1. 如果服务器不在线,跳过这个服务器
if dao.SortedServerList[index].TaskStream == nil {
i--
index++
// 如果已经轮了一整圈没有合适机器去请求,跳出循环
for round == 0 && prevIndex != workedServerIndex {
// 如果到了圈尾,再回到圈头,圈数加一,游标重置
if workedServerIndex == len(dao.SortedServerList) {
workedServerIndex = 0
round++
continue
}
// 2. 如果此任务不可使用此服务器请求,跳过这个服务器(有些 IPv6 only 开了 NAT64 的机器请求 IPv4 总会出问题)
if (tasks[i].Cover == model.MonitorCoverAll && tasks[i].SkipServers[dao.SortedServerList[index].ID]) ||
(tasks[i].Cover == model.MonitorCoverIgnoreAll && !tasks[i].SkipServers[dao.SortedServerList[index].ID]) {
i--
index++
// 如果服务器不在线,跳过这个服务器
if dao.SortedServerList[workedServerIndex].TaskStream == nil {
workedServerIndex++
continue
}
hasAliveAgent = true
dao.SortedServerList[index].TaskStream.Send(tasks[i].PB())
index++
// 如果此任务不可使用此服务器请求,跳过这个服务器(有些 IPv6 only 开了 NAT64 的机器请求 IPv4 总会出问题)
if (task.Cover == model.MonitorCoverAll && task.SkipServers[dao.SortedServerList[workedServerIndex].ID]) ||
(task.Cover == model.MonitorCoverIgnoreAll && !task.SkipServers[dao.SortedServerList[workedServerIndex].ID]) {
workedServerIndex++
continue
}
// 找到合适机器执行任务,跳出循环
dao.SortedServerList[workedServerIndex].TaskStream.Send(task.PB())
workedServerIndex++
break
}
dao.SortedServerLock.RUnlock()
time.Sleep(time.Until(startedAt.Add(duration)))
}
}

View File

@ -1,7 +1,7 @@
package model
type ServiceItemResponse struct {
Monitor Monitor
Monitor *Monitor
TotalUp uint64
TotalDown uint64
CurrentUp uint64

View File

@ -2,8 +2,10 @@ package model
import (
"encoding/json"
"fmt"
pb "github.com/naiba/nezha/proto"
"github.com/robfig/cron/v3"
"gorm.io/gorm"
)
@ -36,9 +38,12 @@ type Monitor struct {
Type uint8
Target string
SkipServersRaw string
Duration uint64
Notify bool
Cover uint8
SkipServers map[uint64]bool `gorm:"-" json:"-"`
SkipServers map[uint64]bool `gorm:"-" json:"-"`
CronJobID cron.EntryID `gorm:"-" json:"-"`
}
func (m *Monitor) PB() *pb.Task {
@ -49,6 +54,14 @@ func (m *Monitor) PB() *pb.Task {
}
}
// CronSpec returns the "@every <n>s" schedule string for this monitor, where
// n is m.Duration in seconds.
//
// A Duration of 0 is treated as unset: it is replaced by the 30-second default
// and the default is written back to the receiver before the spec is built.
func (m *Monitor) CronSpec() string {
	const fallbackSeconds = 30
	if m.Duration == 0 {
		// Store the default on the receiver so later reads see the same value.
		m.Duration = fallbackSeconds
	}
	return "@every " + fmt.Sprint(m.Duration) + "s"
}
func (m *Monitor) AfterFind(tx *gorm.DB) error {
var skipServers []uint64
if err := json.Unmarshal([]byte(m.SkipServersRaw), &skipServers); err != nil {

View File

@ -55,7 +55,8 @@ function showFormModal(modelSelector, formID, URL, getData) {
item.name === "RequestMethod" ||
item.name === "DisplayIndex" ||
item.name === "Type" ||
item.name === "Cover"
item.name === "Cover" ||
item.name === "Duration"
) {
obj[item.name] = parseInt(item.value);
} else {
@ -218,6 +219,7 @@ function addOrEditMonitor(monitor) {
modal.find("input[name=ID]").val(monitor ? monitor.ID : null);
modal.find("input[name=Name]").val(monitor ? monitor.Name : null);
modal.find("input[name=Target]").val(monitor ? monitor.Target : null);
modal.find("input[name=Duration]").val(monitor && monitor.Duration ? monitor.Duration : 30);
modal.find("select[name=Type]").val(monitor ? monitor.Type : 1);
modal.find("select[name=Cover]").val(monitor ? monitor.Cover : 0);
if (monitor && monitor.Notify) {

View File

@ -9,7 +9,7 @@
<script src="https://cdn.jsdelivr.net/npm/semantic-ui@2.4.1/dist/semantic.min.js"></script>
<script src="/static/semantic-ui-alerts.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/vue@2.6.12/dist/vue.min.js"></script>
<script src="/static/main.js?v20210819"></script>
<script src="/static/main.js?v20210902"></script>
</body>
</html>

View File

@ -24,6 +24,10 @@
<option value="3">TCP-Ping</option>
</select>
</div>
<div class="field">
<label>请求间隔</label>
<input type="number" name="Duration" placeholder="秒" />
</div>
<div class="field">
<label>覆盖范围</label>
<select name="Cover" class="ui fluid dropdown">

View File

@ -18,6 +18,7 @@
<th>覆盖范围</th>
<th>特定服务器</th>
<th>类型</th>
<th>请求间隔</th>
<th>通知</th>
<th>管理</th>
</tr>
@ -34,6 +35,7 @@
{{if eq $monitor.Type 1}}HTTP(S)/SSL证书 {{else if eq $monitor.Type
2}} ICMP Ping {{else}} TCP 端口 {{end}}
</td>
<td>{{$monitor.Duration}}秒</td>
<td>{{$monitor.Notify}}</td>
<td>
<div class="ui mini icon buttons">

View File

@ -13,7 +13,7 @@ import (
pb "github.com/naiba/nezha/proto"
)
var Version = "v0.9.32" // !!记得修改 README 中的 badge 版本!!
var Version = "v0.9.33" // !!记得修改 README 中的 badge 版本!!
var (
Conf *model.Config

View File

@ -10,6 +10,7 @@ import (
"github.com/naiba/nezha/model"
pb "github.com/naiba/nezha/proto"
"github.com/robfig/cron/v3"
)
const _CurrentStatusSize = 30 // 统计 15 分钟内的数据为当前状态
@ -27,7 +28,7 @@ type _TodayStatsOfMonitor struct {
Delay float32
}
func NewServiceSentinel() {
func NewServiceSentinel(serviceSentinelDispatchBus chan<- model.Monitor) {
ServiceSentinelShared = &ServiceSentinel{
serviceReportChannel: make(chan ReportData, 200),
serviceStatusToday: make(map[uint64]*_TodayStatsOfMonitor),
@ -37,12 +38,15 @@ func NewServiceSentinel() {
lastStatus: make(map[uint64]string),
serviceResponseDataStoreCurrentUp: make(map[uint64]uint64),
serviceResponseDataStoreCurrentDown: make(map[uint64]uint64),
monitors: make(map[uint64]model.Monitor),
monitors: make(map[uint64]*model.Monitor),
sslCertCache: make(map[uint64]string),
// 30天数据缓存
monthlyStatus: make(map[uint64]*model.ServiceItemResponse),
dispatchCron: cron.New(cron.WithSeconds()),
dispatchBus: serviceSentinelDispatchBus,
}
ServiceSentinelShared.OnMonitorUpdate()
ServiceSentinelShared.loadMonitorHistory()
ServiceSentinelShared.dispatchCron.Start()
year, month, day := time.Now().Date()
today := time.Date(year, month, day, 0, 0, 0, 0, time.Local)
@ -92,11 +96,14 @@ type ServiceSentinel struct {
lastStatus map[uint64]string
serviceResponseDataStoreCurrentUp map[uint64]uint64
serviceResponseDataStoreCurrentDown map[uint64]uint64
monitors map[uint64]model.Monitor
monitors map[uint64]*model.Monitor
sslCertCache map[uint64]string
// 30天数据缓存
monthlyStatusLock sync.Mutex
monthlyStatus map[uint64]*model.ServiceItemResponse
// 服务监控调度计划任务
dispatchCron *cron.Cron
dispatchBus chan<- model.Monitor
}
func (ss *ServiceSentinel) refreshMonthlyServiceStatus() {
@ -118,10 +125,10 @@ func (ss *ServiceSentinel) Dispatch(r ReportData) {
ss.serviceReportChannel <- r
}
func (ss *ServiceSentinel) Monitors() []model.Monitor {
func (ss *ServiceSentinel) Monitors() []*model.Monitor {
ss.monitorsLock.RLock()
defer ss.monitorsLock.RUnlock()
var monitors []model.Monitor
var monitors []*model.Monitor
for _, v := range ss.monitors {
monitors = append(monitors, v)
}
@ -131,14 +138,21 @@ func (ss *ServiceSentinel) Monitors() []model.Monitor {
return monitors
}
func (ss *ServiceSentinel) OnMonitorUpdate() {
var monitors []model.Monitor
func (ss *ServiceSentinel) loadMonitorHistory() {
var monitors []*model.Monitor
DB.Find(&monitors)
var err error
ss.monitorsLock.Lock()
defer ss.monitorsLock.Unlock()
ss.monitors = make(map[uint64]model.Monitor)
ss.monitors = make(map[uint64]*model.Monitor)
for i := 0; i < len(monitors); i++ {
task := *monitors[i]
monitors[i].CronJobID, err = ss.dispatchCron.AddFunc(task.CronSpec(), func() {
ss.dispatchBus <- task
})
if err != nil {
panic(err)
}
ss.monitors[monitors[i].ID] = monitors[i]
if len(ss.serviceCurrentStatusData[monitors[i].ID]) == 0 {
ss.serviceCurrentStatusData[monitors[i].ID] = make([]model.MonitorHistory, _CurrentStatusSize)
@ -178,6 +192,36 @@ func (ss *ServiceSentinel) OnMonitorUpdate() {
}
}
// OnMonitorUpdate installs the dispatch schedule for monitor m and records m
// as the current definition for its ID.
//
// A cron entry built from m.CronSpec() is added first; each time it fires it
// sends m on ss.dispatchBus. If adding the entry fails, the error is returned
// and nothing else is changed. Otherwise:
//   - if a monitor with the same ID is already known, its previous cron entry
//     is removed;
//   - if the ID is new, a zeroed 30-slot monthly status record is created.
//
// ss.monitorsLock is held for the whole call; ss.monthlyStatusLock is also
// held until return when a new record is created.
func (ss *ServiceSentinel) OnMonitorUpdate(m model.Monitor) error {
	ss.monitorsLock.Lock()
	defer ss.monitorsLock.Unlock()

	// Register the new job before touching the old one.
	entryID, err := ss.dispatchCron.AddFunc(m.CronSpec(), func() {
		ss.dispatchBus <- m
	})
	if err != nil {
		return err
	}
	m.CronJobID = entryID

	if previous := ss.monitors[m.ID]; previous != nil {
		// Known monitor: stop its old job.
		ss.dispatchCron.Remove(previous.CronJobID)
	} else {
		// New monitor: initialise its monthly statistics.
		ss.monthlyStatusLock.Lock()
		defer ss.monthlyStatusLock.Unlock()
		ss.monthlyStatus[m.ID] = &model.ServiceItemResponse{
			Monitor: &m,
			Delay:   new([30]float32),
			Up:      new([30]int),
			Down:    new([30]int),
		}
	}

	// Store the updated monitor.
	ss.monitors[m.ID] = &m
	return nil
}
func (ss *ServiceSentinel) OnMonitorDelete(id uint64) {
ss.serviceResponseDataStoreLock.Lock()
defer ss.serviceResponseDataStoreLock.Unlock()
@ -190,6 +234,8 @@ func (ss *ServiceSentinel) OnMonitorDelete(id uint64) {
delete(ss.sslCertCache, id)
ss.monitorsLock.Lock()
defer ss.monitorsLock.Unlock()
// 停掉定时任务
ss.dispatchCron.Remove(ss.monitors[id].CronJobID)
delete(ss.monitors, id)
ss.monthlyStatusLock.Lock()
defer ss.monthlyStatusLock.Unlock()