服务监控支持触发任务执行

This commit is contained in:
Akkia 2023-04-15 19:04:38 +08:00
parent 48802cc432
commit e5704157e3
No known key found for this signature in database
GPG Key ID: DABE9A4AB2DD7EF3
7 changed files with 178 additions and 27 deletions

View File

@ -396,18 +396,21 @@ func (ma *memberAPI) addOrEditServer(c *gin.Context) {
} }
type monitorForm struct { type monitorForm struct {
ID uint64 ID uint64
Name string Name string
Target string Target string
Type uint8 Type uint8
Cover uint8 Cover uint8
Notify string Notify string
NotificationTag string NotificationTag string
SkipServersRaw string SkipServersRaw string
Duration uint64 Duration uint64
MinLatency float32 MinLatency float32
MaxLatency float32 MaxLatency float32
LatencyNotify string LatencyNotify string
EnableTriggerTask string
FailTriggerTasksRaw string
RecoverTriggerTasksRaw string
} }
func (ma *memberAPI) addOrEditMonitor(c *gin.Context) { func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
@ -427,6 +430,9 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
m.LatencyNotify = mf.LatencyNotify == "on" m.LatencyNotify = mf.LatencyNotify == "on"
m.MinLatency = mf.MinLatency m.MinLatency = mf.MinLatency
m.MaxLatency = mf.MaxLatency m.MaxLatency = mf.MaxLatency
m.EnableTriggerTask = mf.EnableTriggerTask == "on"
m.RecoverTriggerTasksRaw = mf.RecoverTriggerTasksRaw
m.FailTriggerTasksRaw = mf.FailTriggerTasksRaw
err = m.InitSkipServers() err = m.InitSkipServers()
} }
if err == nil { if err == nil {
@ -434,6 +440,12 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
if m.NotificationTag == "" { if m.NotificationTag == "" {
m.NotificationTag = "default" m.NotificationTag = "default"
} }
if err == nil {
err = utils.Json.Unmarshal([]byte(mf.FailTriggerTasksRaw), &m.FailTriggerTasks)
}
if err == nil {
err = utils.Json.Unmarshal([]byte(mf.RecoverTriggerTasksRaw), &m.RecoverTriggerTasks)
}
if m.ID == 0 { if m.ID == 0 {
err = singleton.DB.Create(&m).Error err = singleton.DB.Create(&m).Error
} else { } else {

View File

@ -48,6 +48,12 @@ type Monitor struct {
NotificationTag string // 当前服务监控所属的通知组 NotificationTag string // 当前服务监控所属的通知组
Cover uint8 Cover uint8
EnableTriggerTask bool `gorm:"default: false"`
FailTriggerTasksRaw string `gorm:"default:'[]'"`
RecoverTriggerTasksRaw string `gorm:"default:'[]'"`
FailTriggerTasks []uint64 `gorm:"-" json:"-"` // 失败时执行的触发任务id
RecoverTriggerTasks []uint64 `gorm:"-" json:"-"` // 恢复时执行的触发任务id
MinLatency float32 MinLatency float32
MaxLatency float32 MaxLatency float32
LatencyNotify bool LatencyNotify bool
@ -73,6 +79,21 @@ func (m *Monitor) CronSpec() string {
return fmt.Sprintf("@every %ds", m.Duration) return fmt.Sprintf("@every %ds", m.Duration)
} }
// BeforeSave is a GORM hook that serializes the in-memory trigger-task ID
// lists (FailTriggerTasks, RecoverTriggerTasks) into their raw JSON string
// columns before the record is written.
//
// A nil list is stored as "[]" rather than "null": encoding/json-compatible
// encoders marshal a nil slice as null, which would disagree with the '[]'
// column default and break consumers that parse the raw column as an array.
//
// It returns the marshal error, if any; tx is unused.
func (m *Monitor) BeforeSave(tx *gorm.DB) error {
	// Work on local copies so the receiver's slices are not replaced.
	failTasks := m.FailTriggerTasks
	if failTasks == nil {
		failTasks = []uint64{}
	}
	failRaw, err := utils.Json.Marshal(failTasks)
	if err != nil {
		return err
	}
	m.FailTriggerTasksRaw = string(failRaw)

	recoverTasks := m.RecoverTriggerTasks
	if recoverTasks == nil {
		recoverTasks = []uint64{}
	}
	recoverRaw, err := utils.Json.Marshal(recoverTasks)
	if err != nil {
		return err
	}
	m.RecoverTriggerTasksRaw = string(recoverRaw)

	return nil
}
func (m *Monitor) AfterFind(tx *gorm.DB) error { func (m *Monitor) AfterFind(tx *gorm.DB) error {
m.SkipServers = make(map[uint64]bool) m.SkipServers = make(map[uint64]bool)
var skipServers []uint64 var skipServers []uint64
@ -83,6 +104,15 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
for i := 0; i < len(skipServers); i++ { for i := 0; i < len(skipServers); i++ {
m.SkipServers[skipServers[i]] = true m.SkipServers[skipServers[i]] = true
} }
// 加载触发任务列表
if err := utils.Json.Unmarshal([]byte(m.FailTriggerTasksRaw), &m.FailTriggerTasks); err != nil {
return err
}
if err := utils.Json.Unmarshal([]byte(m.RecoverTriggerTasksRaw), &m.RecoverTriggerTasks); err != nil {
return err
}
return nil return nil
} }

View File

@ -184,6 +184,9 @@ other = "始终触发"
[ModeOnetimeTrigger] [ModeOnetimeTrigger]
other = "单次触发" other = "单次触发"
[EnableTriggerTask]
other = "启用触发任务"
[FailTriggerTasks] [FailTriggerTasks]
other = "故障时触发任务" other = "故障时触发任务"

View File

@ -330,22 +330,61 @@ function addOrEditMonitor(monitor) {
modal.find("a.ui.label.visible").each((i, el) => { modal.find("a.ui.label.visible").each((i, el) => {
el.remove(); el.remove();
}); });
if (monitor && monitor.EnableTriggerTask) {
modal.find(".ui.nb-EnableTriggerTask.checkbox").checkbox("set checked");
} else {
modal.find(".ui.nb-EnableTriggerTask.checkbox").checkbox("set unchecked");
}
var servers; var servers;
var failTriggerTasks;
var recoverTriggerTasks;
if (monitor) { if (monitor) {
servers = monitor.SkipServersRaw; servers = monitor.SkipServersRaw;
const serverList = JSON.parse(servers || "[]"); const serverList = JSON.parse(servers || "[]");
const node = modal.find("i.dropdown.icon"); const node = modal.find("i.dropdown.icon.specificServer");
for (let i = 0; i < serverList.length; i++) { for (let i = 0; i < serverList.length; i++) {
node.after( node.after(
'<a class="ui label transition visible" data-value="' + '<a class="ui label transition visible" data-value="' +
serverList[i] + serverList[i] +
'" style="display: inline-block !important;">ID:' + '" style="display: inline-block !important;">ID:' +
serverList[i] + serverList[i] +
'<i class="delete icon"></i></a>' '<i class="delete icon"></i></a>'
);
}
failTriggerTasks = monitor.FailTriggerTasksRaw;
recoverTriggerTasks = monitor.RecoverTriggerTasksRaw;
const failTriggerTasksList = JSON.parse(failTriggerTasks || "[]");
const recoverTriggerTasksList = JSON.parse(recoverTriggerTasks || "[]");
const node1 = modal.find("i.dropdown.icon.failTask");
const node2 = modal.find("i.dropdown.icon.recoverTask");
for (let i = 0; i < failTriggerTasksList.length; i++) {
node1.after(
'<a class="ui label transition visible" data-value="' +
failTriggerTasksList[i] +
'" style="display: inline-block !important;">ID:' +
failTriggerTasksList[i] +
'<i class="delete icon"></i></a>'
);
}
for (let i = 0; i < recoverTriggerTasksList.length; i++) {
node2.after(
'<a class="ui label transition visible" data-value="' +
recoverTriggerTasksList[i] +
'" style="display: inline-block !important;">ID:' +
recoverTriggerTasksList[i] +
'<i class="delete icon"></i></a>'
); );
} }
} }
modal modal
.find("input[name=FailTriggerTasksRaw]")
.val(monitor ? "[]," + failTriggerTasks.substr(1, failTriggerTasks.length - 2) : "[]");
modal
.find("input[name=RecoverTriggerTasksRaw]")
.val(monitor ? "[]," + recoverTriggerTasks.substr(1, recoverTriggerTasks.length - 2) : "[]");
modal
.find("input[name=SkipServersRaw]") .find("input[name=SkipServersRaw]")
.val(monitor ? "[]," + servers.substr(1, servers.length - 2) : "[]"); .val(monitor ? "[]," + servers.substr(1, servers.length - 2) : "[]");
showFormModal(".monitor.modal", "#monitorForm", "/api/monitor"); showFormModal(".monitor.modal", "#monitorForm", "/api/monitor");

View File

@ -39,7 +39,7 @@
<label>{{tr "SpecificServers"}}</label> <label>{{tr "SpecificServers"}}</label>
<div class="ui fluid multiple servers search selection dropdown"> <div class="ui fluid multiple servers search selection dropdown">
<input type="hidden" name="SkipServersRaw" /> <input type="hidden" name="SkipServersRaw" />
<i class="dropdown icon"></i> <i class="dropdown icon specificServer"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div> <div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div> <div class="menu"></div>
</div> </div>
@ -68,6 +68,33 @@
<label>{{tr "EnableLatencyNotification"}}</label> <label>{{tr "EnableLatencyNotification"}}</label>
</div> </div>
</div> </div>
<div class="field">
<div class="ui nb-EnableTriggerTask checkbox">
<input name="EnableTriggerTask" type="checkbox" tabindex="0" class="hidden" />
<label>{{tr "EnableTriggerTask"}}</label>
</div>
</div>
<div class="field">
<label>{{tr "FailTriggerTasks"}}</label>
<div class="ui fluid multiple tasks search selection dropdown">
<input type="hidden" name="FailTriggerTasksRaw">
<i class="dropdown icon failTask"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div>
</div>
</div>
<div class="field">
<label>{{tr "RecoverTriggerTasks"}}</label>
<div class="ui fluid multiple tasks search selection dropdown">
<input type="hidden" name="RecoverTriggerTasksRaw">
<i class="dropdown icon recoverTask"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div>
</div>
</div>
</form> </form>
<div class="ui warning message"> <div class="ui warning message">
<p> <p>

View File

@ -22,6 +22,9 @@
<th>{{tr "NotificationMethodGroup"}}</th> <th>{{tr "NotificationMethodGroup"}}</th>
<th>{{tr "FailureNotification"}}</th> <th>{{tr "FailureNotification"}}</th>
<th>{{tr "LatencyNotification"}}</th> <th>{{tr "LatencyNotification"}}</th>
<th>{{tr "EnableTriggerTask"}}</th>
<th>{{tr "FailTriggerTasks"}}</th>
<th>{{tr "RecoverTriggerTasks"}}</th>
<th>{{tr "Administration"}}</th> <th>{{tr "Administration"}}</th>
</tr> </tr>
</thead> </thead>
@ -41,6 +44,9 @@
<td>{{$monitor.NotificationTag}}</td> <td>{{$monitor.NotificationTag}}</td>
<td>{{$monitor.Notify}}</td> <td>{{$monitor.Notify}}</td>
<td>{{$monitor.LatencyNotify}}</td> <td>{{$monitor.LatencyNotify}}</td>
<td>{{$monitor.EnableTriggerTask}}</td>
<td>{{$monitor.FailTriggerTasksRaw}}</td>
<td>{{$monitor.RecoverTriggerTasksRaw}}</td>
<td> <td>
<div class="ui mini icon buttons"> <div class="ui mini icon buttons">
<button class="ui button" onclick="addOrEditMonitor({{$monitor}})"> <button class="ui button" onclick="addOrEditMonitor({{$monitor}})">

View File

@ -339,10 +339,12 @@ func (ss *ServiceSentinel) worker() {
// 写入当前数据 // 写入当前数据
ss.serviceCurrentStatusData[mh.GetId()][ss.serviceCurrentStatusIndex[mh.GetId()]] = mh ss.serviceCurrentStatusData[mh.GetId()][ss.serviceCurrentStatusIndex[mh.GetId()]] = mh
ss.serviceCurrentStatusIndex[mh.GetId()]++ ss.serviceCurrentStatusIndex[mh.GetId()]++
// 更新当前状态 // 更新当前状态
ss.serviceResponseDataStoreCurrentUp[mh.GetId()] = 0 ss.serviceResponseDataStoreCurrentUp[mh.GetId()] = 0
ss.serviceResponseDataStoreCurrentDown[mh.GetId()] = 0 ss.serviceResponseDataStoreCurrentDown[mh.GetId()] = 0
ss.serviceResponseDataStoreCurrentAvgDelay[mh.GetId()] = 0 ss.serviceResponseDataStoreCurrentAvgDelay[mh.GetId()] = 0
// 永远是最新的 30 个数据的状态 [01:00, 02:00, 03:00] -> [04:00, 02:00, 03: 00] // 永远是最新的 30 个数据的状态 [01:00, 02:00, 03:00] -> [04:00, 02:00, 03: 00]
for i := 0; i < len(ss.serviceCurrentStatusData[mh.GetId()]); i++ { for i := 0; i < len(ss.serviceCurrentStatusData[mh.GetId()]); i++ {
if ss.serviceCurrentStatusData[mh.GetId()][i].GetId() > 0 { if ss.serviceCurrentStatusData[mh.GetId()][i].GetId() > 0 {
@ -354,11 +356,14 @@ func (ss *ServiceSentinel) worker() {
} }
} }
} }
// 计算在线率,
var upPercent uint64 = 0 var upPercent uint64 = 0
if ss.serviceResponseDataStoreCurrentDown[mh.GetId()]+ss.serviceResponseDataStoreCurrentUp[mh.GetId()] > 0 { if ss.serviceResponseDataStoreCurrentDown[mh.GetId()]+ss.serviceResponseDataStoreCurrentUp[mh.GetId()] > 0 {
upPercent = ss.serviceResponseDataStoreCurrentUp[mh.GetId()] * 100 / (ss.serviceResponseDataStoreCurrentDown[mh.GetId()] + ss.serviceResponseDataStoreCurrentUp[mh.GetId()]) upPercent = ss.serviceResponseDataStoreCurrentUp[mh.GetId()] * 100 / (ss.serviceResponseDataStoreCurrentDown[mh.GetId()] + ss.serviceResponseDataStoreCurrentUp[mh.GetId()])
} }
stateCode := GetStatusCode(upPercent) stateCode := GetStatusCode(upPercent)
// 数据持久化 // 数据持久化
if ss.serviceCurrentStatusIndex[mh.GetId()] == _CurrentStatusSize { if ss.serviceCurrentStatusIndex[mh.GetId()] == _CurrentStatusSize {
ss.serviceCurrentStatusIndex[mh.GetId()] = 0 ss.serviceCurrentStatusIndex[mh.GetId()] = 0
@ -372,34 +377,63 @@ func (ss *ServiceSentinel) worker() {
log.Println("NEZHA>> 服务监控数据持久化失败:", err) log.Println("NEZHA>> 服务监控数据持久化失败:", err)
} }
} }
// 延迟报警 // 延迟报警
if mh.Delay > 0 { if mh.Delay > 0 {
ss.monitorsLock.RLock() ss.monitorsLock.RLock()
if ss.monitors[mh.GetId()].LatencyNotify { if ss.monitors[mh.GetId()].LatencyNotify {
if mh.Delay > ss.monitors[mh.GetId()].MaxLatency { if mh.Delay > ss.monitors[mh.GetId()].MaxLatency {
ServerLock.RLock() ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f > %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MaxLatency, ServerList[r.Reporter].Name), NotificationMuteLabel.ServiceLatencyMin(mh.GetId())) reporterServer := ServerList[r.Reporter]
msg := fmt.Sprintf("[Latency] %s %2f > %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MaxLatency, reporterServer.Name)
muteLabel := NotificationMuteLabel.ServiceLatencyMin(mh.GetId())
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, msg, muteLabel)
ServerLock.RUnlock() ServerLock.RUnlock()
} }
if mh.Delay < ss.monitors[mh.GetId()].MinLatency { if mh.Delay < ss.monitors[mh.GetId()].MinLatency {
ServerLock.RLock() ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f < %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MinLatency, ServerList[r.Reporter].Name), NotificationMuteLabel.ServiceLatencyMax(mh.GetId())) reporterServer := ServerList[r.Reporter]
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f < %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MinLatency, reporterServer.Name), NotificationMuteLabel.ServiceLatencyMax(mh.GetId()))
ServerLock.RUnlock() ServerLock.RUnlock()
} }
} }
ss.monitorsLock.RUnlock() ss.monitorsLock.RUnlock()
} }
// 故障报警
// 状态变更报警
if stateCode == StatusDown || stateCode != ss.lastStatus[mh.GetId()] { if stateCode == StatusDown || stateCode != ss.lastStatus[mh.GetId()] {
ss.monitorsLock.RLock() ss.monitorsLock.Lock()
isNeedSendNotification := (ss.lastStatus[mh.GetId()] != 0 || stateCode == StatusDown) && ss.monitors[mh.GetId()].Notify lastStatus := ss.lastStatus[mh.GetId()]
// 存储新的状态值
ss.lastStatus[mh.GetId()] = stateCode ss.lastStatus[mh.GetId()] = stateCode
// 判断是否需要发送提醒
isNeedSendNotification := ss.monitors[mh.GetId()].Notify && (lastStatus != 0 || stateCode == StatusDown)
if isNeedSendNotification { if isNeedSendNotification {
ServerLock.RLock() ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[%s] %s Reporter: %s, Error: %s", StatusCodeToString(stateCode), ss.monitors[mh.GetId()].Name, ServerList[r.Reporter].Name, mh.Data), NotificationMuteLabel.ServiceStateChanged(mh.GetId())) reporterServer := ServerList[r.Reporter]
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[%s] %s Reporter: %s, Error: %s", StatusCodeToString(stateCode), ss.monitors[mh.GetId()].Name, reporterServer.Name, mh.Data), NotificationMuteLabel.ServiceStateChanged(mh.GetId()))
ServerLock.RUnlock() ServerLock.RUnlock()
} }
ss.monitorsLock.RUnlock()
// 判断是否需要触发任务
if ss.monitors[mh.GetId()].EnableTriggerTask && lastStatus != 0 {
ServerLock.RLock()
reporterServer := ServerList[r.Reporter]
ServerLock.RUnlock()
if stateCode == StatusGood && lastStatus != stateCode {
// 当前状态正常 前序状态异常时 触发恢复任务
go SendTriggerTasks(ss.monitors[mh.GetId()].RecoverTriggerTasks, reporterServer.ID)
} else if lastStatus == StatusGood && lastStatus != stateCode {
// 前序状态正常 当前状态异常时 触发失败任务
go SendTriggerTasks(ss.monitors[mh.GetId()].FailTriggerTasks, reporterServer.ID)
}
}
// 当前状态正常 前序状态非正常时触发恢复任务
ss.monitorsLock.Unlock()
} }
ss.serviceResponseDataStoreLock.Unlock() ss.serviceResponseDataStoreLock.Unlock()
// SSL 证书报警 // SSL 证书报警