服务监控支持触发任务执行

This commit is contained in:
Akkia 2023-04-15 19:04:38 +08:00
parent 48802cc432
commit e5704157e3
No known key found for this signature in database
GPG Key ID: DABE9A4AB2DD7EF3
7 changed files with 178 additions and 27 deletions

View File

@ -396,18 +396,21 @@ func (ma *memberAPI) addOrEditServer(c *gin.Context) {
}
// monitorForm holds the raw values submitted by the monitor add/edit form.
//
// Checkbox-style options (LatencyNotify, EnableTriggerTask) are kept as
// strings and compared against "on" by the handler. The *Raw fields carry
// JSON-encoded ID lists as text: FailTriggerTasksRaw and
// RecoverTriggerTasksRaw are decoded by the handler into the monitor's
// FailTriggerTasks and RecoverTriggerTasks slices.
//
// Each field is declared exactly once; duplicate field names in a struct
// are a compile error.
type monitorForm struct {
	ID                     uint64
	Name                   string
	Target                 string
	Type                   uint8
	Cover                  uint8
	Notify                 string
	NotificationTag        string
	SkipServersRaw         string
	Duration               uint64
	MinLatency             float32
	MaxLatency             float32
	LatencyNotify          string
	EnableTriggerTask      string
	FailTriggerTasksRaw    string
	RecoverTriggerTasksRaw string
}
func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
@ -427,6 +430,9 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
m.LatencyNotify = mf.LatencyNotify == "on"
m.MinLatency = mf.MinLatency
m.MaxLatency = mf.MaxLatency
m.EnableTriggerTask = mf.EnableTriggerTask == "on"
m.RecoverTriggerTasksRaw = mf.RecoverTriggerTasksRaw
m.FailTriggerTasksRaw = mf.FailTriggerTasksRaw
err = m.InitSkipServers()
}
if err == nil {
@ -434,6 +440,12 @@ func (ma *memberAPI) addOrEditMonitor(c *gin.Context) {
if m.NotificationTag == "" {
m.NotificationTag = "default"
}
if err == nil {
err = utils.Json.Unmarshal([]byte(mf.FailTriggerTasksRaw), &m.FailTriggerTasks)
}
if err == nil {
err = utils.Json.Unmarshal([]byte(mf.RecoverTriggerTasksRaw), &m.RecoverTriggerTasks)
}
if m.ID == 0 {
err = singleton.DB.Create(&m).Error
} else {

View File

@ -48,6 +48,12 @@ type Monitor struct {
NotificationTag string // 当前服务监控所属的通知组
Cover uint8
EnableTriggerTask bool `gorm:"default: false"`
FailTriggerTasksRaw string `gorm:"default:'[]'"`
RecoverTriggerTasksRaw string `gorm:"default:'[]'"`
FailTriggerTasks []uint64 `gorm:"-" json:"-"` // 失败时执行的触发任务id
RecoverTriggerTasks []uint64 `gorm:"-" json:"-"` // 恢复时执行的触发任务id
MinLatency float32
MaxLatency float32
LatencyNotify bool
@ -73,6 +79,21 @@ func (m *Monitor) CronSpec() string {
return fmt.Sprintf("@every %ds", m.Duration)
}
// BeforeSave serializes the in-memory trigger task ID lists into their
// persisted text columns: FailTriggerTasks into FailTriggerTasksRaw and
// RecoverTriggerTasks into RecoverTriggerTasksRaw. The signature matches
// GORM's BeforeSave hook; tx is not used.
//
// It returns the first marshalling error encountered. If the first list fails
// to marshal, neither Raw field is modified; if only the second fails,
// FailTriggerTasksRaw has already been updated.
func (m *Monitor) BeforeSave(tx *gorm.DB) error {
	// A nil slice is encoded as JSON null by encoding/json-compatible
	// encoders (assumed to apply to utils.Json). Substitute an empty slice so
	// the stored text is always a JSON array, matching the '[]' column
	// default.
	failTasks := m.FailTriggerTasks
	if failTasks == nil {
		failTasks = []uint64{}
	}
	failRaw, err := utils.Json.Marshal(failTasks)
	if err != nil {
		return err
	}
	m.FailTriggerTasksRaw = string(failRaw)

	recoverTasks := m.RecoverTriggerTasks
	if recoverTasks == nil {
		recoverTasks = []uint64{}
	}
	recoverRaw, err := utils.Json.Marshal(recoverTasks)
	if err != nil {
		return err
	}
	m.RecoverTriggerTasksRaw = string(recoverRaw)

	return nil
}
func (m *Monitor) AfterFind(tx *gorm.DB) error {
m.SkipServers = make(map[uint64]bool)
var skipServers []uint64
@ -83,6 +104,15 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
for i := 0; i < len(skipServers); i++ {
m.SkipServers[skipServers[i]] = true
}
// 加载触发任务列表
if err := utils.Json.Unmarshal([]byte(m.FailTriggerTasksRaw), &m.FailTriggerTasks); err != nil {
return err
}
if err := utils.Json.Unmarshal([]byte(m.RecoverTriggerTasksRaw), &m.RecoverTriggerTasks); err != nil {
return err
}
return nil
}

View File

@ -184,6 +184,9 @@ other = "始终触发"
[ModeOnetimeTrigger]
other = "单次触发"
[EnableTriggerTask]
other = "启用触发任务"
[FailTriggerTasks]
other = "故障时触发任务"

View File

@ -330,22 +330,61 @@ function addOrEditMonitor(monitor) {
modal.find("a.ui.label.visible").each((i, el) => {
el.remove();
});
if (monitor && monitor.EnableTriggerTask) {
modal.find(".ui.nb-EnableTriggerTask.checkbox").checkbox("set checked");
} else {
modal.find(".ui.nb-EnableTriggerTask.checkbox").checkbox("set unchecked");
}
var servers;
var failTriggerTasks;
var recoverTriggerTasks;
if (monitor) {
servers = monitor.SkipServersRaw;
const serverList = JSON.parse(servers || "[]");
const node = modal.find("i.dropdown.icon");
const node = modal.find("i.dropdown.icon.specificServer");
for (let i = 0; i < serverList.length; i++) {
node.after(
'<a class="ui label transition visible" data-value="' +
serverList[i] +
'" style="display: inline-block !important;">ID:' +
serverList[i] +
'<i class="delete icon"></i></a>'
'<a class="ui label transition visible" data-value="' +
serverList[i] +
'" style="display: inline-block !important;">ID:' +
serverList[i] +
'<i class="delete icon"></i></a>'
);
}
failTriggerTasks = monitor.FailTriggerTasksRaw;
recoverTriggerTasks = monitor.RecoverTriggerTasksRaw;
const failTriggerTasksList = JSON.parse(failTriggerTasks || "[]");
const recoverTriggerTasksList = JSON.parse(recoverTriggerTasks || "[]");
const node1 = modal.find("i.dropdown.icon.failTask");
const node2 = modal.find("i.dropdown.icon.recoverTask");
for (let i = 0; i < failTriggerTasksList.length; i++) {
node1.after(
'<a class="ui label transition visible" data-value="' +
failTriggerTasksList[i] +
'" style="display: inline-block !important;">ID:' +
failTriggerTasksList[i] +
'<i class="delete icon"></i></a>'
);
}
for (let i = 0; i < recoverTriggerTasksList.length; i++) {
node2.after(
'<a class="ui label transition visible" data-value="' +
recoverTriggerTasksList[i] +
'" style="display: inline-block !important;">ID:' +
recoverTriggerTasksList[i] +
'<i class="delete icon"></i></a>'
);
}
}
modal
modal
.find("input[name=FailTriggerTasksRaw]")
.val(monitor ? "[]," + failTriggerTasks.substr(1, failTriggerTasks.length - 2) : "[]");
modal
.find("input[name=RecoverTriggerTasksRaw]")
.val(monitor ? "[]," + recoverTriggerTasks.substr(1, recoverTriggerTasks.length - 2) : "[]");
modal
.find("input[name=SkipServersRaw]")
.val(monitor ? "[]," + servers.substr(1, servers.length - 2) : "[]");
showFormModal(".monitor.modal", "#monitorForm", "/api/monitor");

View File

@ -39,7 +39,7 @@
<label>{{tr "SpecificServers"}}</label>
<div class="ui fluid multiple servers search selection dropdown">
<input type="hidden" name="SkipServersRaw" />
<i class="dropdown icon"></i>
<i class="dropdown icon specificServer"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div>
</div>
@ -68,6 +68,33 @@
<label>{{tr "EnableLatencyNotification"}}</label>
</div>
</div>
<div class="field">
<div class="ui nb-EnableTriggerTask checkbox">
<input name="EnableTriggerTask" type="checkbox" tabindex="0" class="hidden" />
<label>{{tr "EnableTriggerTask"}}</label>
</div>
</div>
<div class="field">
<label>{{tr "FailTriggerTasks"}}</label>
<div class="ui fluid multiple tasks search selection dropdown">
<input type="hidden" name="FailTriggerTasksRaw">
<i class="dropdown icon failTask"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div>
</div>
</div>
<div class="field">
<label>{{tr "RecoverTriggerTasks"}}</label>
<div class="ui fluid multiple tasks search selection dropdown">
<input type="hidden" name="RecoverTriggerTasksRaw">
<i class="dropdown icon recoverTask"></i>
<div class="default text">{{tr "EnterIdAndNameToSearch"}}</div>
<div class="menu"></div>
</div>
</div>
</form>
<div class="ui warning message">
<p>

View File

@ -22,6 +22,9 @@
<th>{{tr "NotificationMethodGroup"}}</th>
<th>{{tr "FailureNotification"}}</th>
<th>{{tr "LatencyNotification"}}</th>
<th>{{tr "EnableTriggerTask"}}</th>
<th>{{tr "FailTriggerTasks"}}</th>
<th>{{tr "RecoverTriggerTasks"}}</th>
<th>{{tr "Administration"}}</th>
</tr>
</thead>
@ -41,6 +44,9 @@
<td>{{$monitor.NotificationTag}}</td>
<td>{{$monitor.Notify}}</td>
<td>{{$monitor.LatencyNotify}}</td>
<td>{{$monitor.EnableTriggerTask}}</td>
<td>{{$monitor.FailTriggerTasksRaw}}</td>
<td>{{$monitor.RecoverTriggerTasksRaw}}</td>
<td>
<div class="ui mini icon buttons">
<button class="ui button" onclick="addOrEditMonitor({{$monitor}})">

View File

@ -339,10 +339,12 @@ func (ss *ServiceSentinel) worker() {
// 写入当前数据
ss.serviceCurrentStatusData[mh.GetId()][ss.serviceCurrentStatusIndex[mh.GetId()]] = mh
ss.serviceCurrentStatusIndex[mh.GetId()]++
// 更新当前状态
ss.serviceResponseDataStoreCurrentUp[mh.GetId()] = 0
ss.serviceResponseDataStoreCurrentDown[mh.GetId()] = 0
ss.serviceResponseDataStoreCurrentAvgDelay[mh.GetId()] = 0
// 永远是最新的 30 个数据的状态 [01:00, 02:00, 03:00] -> [04:00, 02:00, 03: 00]
for i := 0; i < len(ss.serviceCurrentStatusData[mh.GetId()]); i++ {
if ss.serviceCurrentStatusData[mh.GetId()][i].GetId() > 0 {
@ -354,11 +356,14 @@ func (ss *ServiceSentinel) worker() {
}
}
}
// 计算在线率,
var upPercent uint64 = 0
if ss.serviceResponseDataStoreCurrentDown[mh.GetId()]+ss.serviceResponseDataStoreCurrentUp[mh.GetId()] > 0 {
upPercent = ss.serviceResponseDataStoreCurrentUp[mh.GetId()] * 100 / (ss.serviceResponseDataStoreCurrentDown[mh.GetId()] + ss.serviceResponseDataStoreCurrentUp[mh.GetId()])
}
stateCode := GetStatusCode(upPercent)
// 数据持久化
if ss.serviceCurrentStatusIndex[mh.GetId()] == _CurrentStatusSize {
ss.serviceCurrentStatusIndex[mh.GetId()] = 0
@ -372,34 +377,63 @@ func (ss *ServiceSentinel) worker() {
log.Println("NEZHA>> 服务监控数据持久化失败:", err)
}
}
// 延迟报警
if mh.Delay > 0 {
ss.monitorsLock.RLock()
if ss.monitors[mh.GetId()].LatencyNotify {
if mh.Delay > ss.monitors[mh.GetId()].MaxLatency {
ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f > %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MaxLatency, ServerList[r.Reporter].Name), NotificationMuteLabel.ServiceLatencyMin(mh.GetId()))
reporterServer := ServerList[r.Reporter]
msg := fmt.Sprintf("[Latency] %s %2f > %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MaxLatency, reporterServer.Name)
muteLabel := NotificationMuteLabel.ServiceLatencyMin(mh.GetId())
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, msg, muteLabel)
ServerLock.RUnlock()
}
if mh.Delay < ss.monitors[mh.GetId()].MinLatency {
ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f < %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MinLatency, ServerList[r.Reporter].Name), NotificationMuteLabel.ServiceLatencyMax(mh.GetId()))
reporterServer := ServerList[r.Reporter]
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[Latency] %s %2f < %2f, Reporter: %s", ss.monitors[mh.GetId()].Name, mh.Delay, ss.monitors[mh.GetId()].MinLatency, reporterServer.Name), NotificationMuteLabel.ServiceLatencyMax(mh.GetId()))
ServerLock.RUnlock()
}
}
ss.monitorsLock.RUnlock()
}
// 故障报警
// 状态变更报警
if stateCode == StatusDown || stateCode != ss.lastStatus[mh.GetId()] {
ss.monitorsLock.RLock()
isNeedSendNotification := (ss.lastStatus[mh.GetId()] != 0 || stateCode == StatusDown) && ss.monitors[mh.GetId()].Notify
ss.monitorsLock.Lock()
lastStatus := ss.lastStatus[mh.GetId()]
// 存储新的状态值
ss.lastStatus[mh.GetId()] = stateCode
// 判断是否需要发送提醒
isNeedSendNotification := ss.monitors[mh.GetId()].Notify && (lastStatus != 0 || stateCode == StatusDown)
if isNeedSendNotification {
ServerLock.RLock()
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[%s] %s Reporter: %s, Error: %s", StatusCodeToString(stateCode), ss.monitors[mh.GetId()].Name, ServerList[r.Reporter].Name, mh.Data), NotificationMuteLabel.ServiceStateChanged(mh.GetId()))
reporterServer := ServerList[r.Reporter]
go SendNotification(ss.monitors[mh.GetId()].NotificationTag, fmt.Sprintf("[%s] %s Reporter: %s, Error: %s", StatusCodeToString(stateCode), ss.monitors[mh.GetId()].Name, reporterServer.Name, mh.Data), NotificationMuteLabel.ServiceStateChanged(mh.GetId()))
ServerLock.RUnlock()
}
ss.monitorsLock.RUnlock()
// 判断是否需要触发任务
if ss.monitors[mh.GetId()].EnableTriggerTask && lastStatus != 0 {
ServerLock.RLock()
reporterServer := ServerList[r.Reporter]
ServerLock.RUnlock()
if stateCode == StatusGood && lastStatus != stateCode {
// 当前状态正常 前序状态异常时 触发恢复任务
go SendTriggerTasks(ss.monitors[mh.GetId()].RecoverTriggerTasks, reporterServer.ID)
} else if lastStatus == StatusGood && lastStatus != stateCode {
// 前序状态正常 当前状态异常时 触发失败任务
go SendTriggerTasks(ss.monitors[mh.GetId()].FailTriggerTasks, reporterServer.ID)
}
}
// 当前状态正常 前序状态非正常时触发恢复任务
ss.monitorsLock.Unlock()
}
ss.serviceResponseDataStoreLock.Unlock()
// SSL 证书报警