🔖 dashboard v0.10.0

This commit is contained in:
naiba 2021-09-27 21:18:09 +08:00
parent ac3cfa162c
commit 4249fa82d7
11 changed files with 81 additions and 69 deletions

View File

@ -4,7 +4,7 @@
<br>
<small><i>LOGO designed by <a href="https://xio.ng" target="_blank">熊大</a> .</i></small>
<br><br>
<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Dashboard%20image?label=Dash%20v0.9.34&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/github/v/release/naiba/nezha?color=brightgreen&label=Agent&style=for-the-badge&logo=github">&nbsp;<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Agent%20release?label=Agent%20CI&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/badge/Installer-v0.7.0-brightgreen?style=for-the-badge&logo=linux">
<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Dashboard%20image?label=Dash%20v0.10.0&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/github/v/release/naiba/nezha?color=brightgreen&label=Agent&style=for-the-badge&logo=github">&nbsp;<img src="https://img.shields.io/github/workflow/status/naiba/nezha/Agent%20release?label=Agent%20CI&logo=github&style=for-the-badge">&nbsp;<img src="https://img.shields.io/badge/Installer-v0.7.0-brightgreen?style=for-the-badge&logo=linux">
<br>
<br>
<p>:trollface: <b>哪吒监控</b> 一站式轻监控轻运维系统。支持系统状态、HTTP(SSL 证书变更、即将到期、到期)、TCP、Ping 监控报警,命令批量执行和计划任务。</p>
@ -20,7 +20,7 @@
## 安装脚本
**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP比如randomdashboard.nai.ba
**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP,比如 (ip-to-dashboard.nai.ba)。
```shell
curl -L https://raw.githubusercontent.com/naiba/nezha/master/script/install.sh -o nezha.sh && chmod +x nezha.sh
@ -36,6 +36,13 @@ CN=true sudo ./nezha.sh
_\* 使用 WatchTower 可以自动更新面板,Windows 终端可以使用 nssm 配置自启动(见尾部教程)_
### 特殊技能
编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上
- `--skip-conn` 不监控连接数,机场/连接密集型机器推荐设置,不然比较占 CPU([shirou/gopsutil/issues#220](https://github.com/shirou/gopsutil/issues/220))
- `--disable-auto-update` 禁止 Agent 自动更新
## 功能说明
<details>
@ -63,7 +70,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- server 酱示例
- 名称server 酱
- URLhttps://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA#
- URL: <https://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA#>
- 请求方式: GET
- 请求类型: 默认
- Body: 空
@ -71,7 +78,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- wxpusher 示例,需要关注你的应用
- 名称: wxpusher
- URLhttp://wxpusher.zjiecode.com/api/send/message
- URL: <http://wxpusher.zjiecode.com/api/send/message>
- 请求方式: POST
- 请求类型: JSON
- Body: `{"appToken":"你的appToken","topicIds":[],"content":"#NEZHA#","contentType":"1","uids":["你的uid"]}`
@ -79,7 +86,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- telegram 示例 [@haitau](https://github.com/haitau) 贡献
- 名称telegram 机器人消息通知
- URLhttps://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA#
- URL: <https://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA#>
- 请求方式: GET
- 请求类型: 默认
- Body: 空
@ -130,6 +137,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- cycle_interval 小时(可以设为 1 月,即 30\*24)
- min/max、cover、ignore 参考基本规则配置
- 示例: 3 号机器的每月 15 号计费的出站月流量 1T 报警 `[{"type":"transfer_out_cycle","max":1000000000000,"cycle_start":"2021-07-15T08:00:00Z","cycle_interval":720,"cover":1,"ignore":{"3":true}}]`
</details>
<details>
@ -221,19 +229,13 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
</details>
<details>
<summary>如何禁用网络连接数监控?</summary>
编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上 `--skip-conn` 就是不监控连接数
</details>
<details>
<summary>Agent 不断重启/无法启动 </summary>
1. 直接执行 `/opt/nezha/agent/nezha-agent -s 面板IP或非CDN域名:面板RPC端口 -p Agent密钥 -d` 查看日志是否是 DNS 问题。
2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 https://port.ping.pe/ 提供的端口检查工具进行检测。
2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 <https://port.ping.pe/> 提供的端口检查工具进行检测。
3. 如果上面步骤检测正常,Agent 正常上线,尝试关闭 SELinux,[如何关闭 SELinux](https://www.google.com/search?q=%E5%85%B3%E9%97%ADSELINUX)
</details>
<details>

View File

@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
@ -29,15 +28,22 @@ import (
"github.com/naiba/nezha/service/rpc"
)
// AgentConfig groups the agent's command-line options into a single value.
// Each field is bound to one flag registered in main.
type AgentConfig struct {
// SkipConnectionCount is bound to --skip-conn; it is passed to
// monitor.GetState to turn off connection counting.
SkipConnectionCount bool
// DisableAutoUpdate is bound to --disable-auto-update; when set, run does
// not start the goroutine that triggers self-updates.
DisableAutoUpdate bool
// Debug is bound to --debug / -d; the package-level println helper only
// prints when it is true.
Debug bool
// Server is the dashboard RPC address dialled over gRPC
// (flag default "localhost:5555").
Server string
// ClientSecret is bound to --password / -p. It is used as the gRPC
// per-RPC credential and sent as the "Secret" header on terminal
// websocket connections; main prints usage and exits when it is empty.
ClientSecret string
}
// init installs process-wide defaults before main runs: unknown-flag parse
// errors are whitelisted on the command-line flag set, and the shared default
// HTTP client is given a five-second timeout.
func init() {
	flag.CommandLine.ParseErrorsWhitelist.UnknownFlags = true
	http.DefaultClient.Timeout = 5 * time.Second
}
var (
server, clientSecret, version string
debug bool
stateConf monitor.GetStateConfig
version string
agentConf AgentConfig
)
var (
@ -61,13 +67,14 @@ func main() {
// 来自于 GoReleaser 的版本号
monitor.Version = version
flag.BoolVarP(&debug, "debug", "d", false, "开启调试信息")
flag.StringVarP(&server, "*server", "s", "localhost:5555", "管理面板RPC端口")
flag.StringVarP(&clientSecret, "password", "p", "", "Agent连接Secret")
flag.BoolVar(&stateConf.SkipConnectionCount, "skip-conn", false, "不监控连接数")
flag.BoolVarP(&agentConf.Debug, "debug", "d", false, "开启调试信息")
flag.StringVarP(&agentConf.Server, "*server", "s", "localhost:5555", "管理面板RPC端口")
flag.StringVarP(&agentConf.ClientSecret, "password", "p", "", "Agent连接Secret")
flag.BoolVar(&agentConf.SkipConnectionCount, "skip-conn", false, "不监控连接数")
flag.BoolVar(&agentConf.DisableAutoUpdate, "disable-auto-update", false, "禁用自动升级")
flag.Parse()
if clientSecret == "" {
if agentConf.ClientSecret == "" {
flag.Usage()
return
}
@ -77,7 +84,7 @@ func main() {
func run() {
auth := rpc.AuthHandler{
ClientSecret: clientSecret,
ClientSecret: agentConf.ClientSecret,
}
go pty.DownloadDependency()
@ -87,7 +94,7 @@ func run() {
// 更新IP信息
go monitor.UpdateIP()
if _, err := semver.Parse(version); err == nil {
if _, err := semver.Parse(version); err == nil && !agentConf.DisableAutoUpdate {
go func() {
for range updateCh {
go doSelfUpdate()
@ -111,7 +118,7 @@ func run() {
for {
timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
conn, err = grpc.DialContext(timeOutCtx, server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth))
conn, err = grpc.DialContext(timeOutCtx, agentConf.Server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth))
if err != nil {
println("与面板建立连接失败:", err)
cancel()
@ -194,7 +201,7 @@ func reportState() {
if client != nil && inited {
monitor.TrackNetworkSpeed()
timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
_, err = client.ReportSystemState(timeOutCtx, monitor.GetState(stateConf).PB())
_, err = client.ReportSystemState(timeOutCtx, monitor.GetState(agentConf.SkipConnectionCount).PB())
cancel()
if err != nil {
println("reportState error", err)
@ -334,7 +341,7 @@ func handleTerminalTask(task *pb.Task) {
protocol += "s"
}
header := http.Header{}
header.Add("Secret", clientSecret)
header.Add("Secret", agentConf.ClientSecret)
conn, _, err := websocket.DefaultDialer.Dial(fmt.Sprintf("%s://%s/terminal/%s", protocol, terminal.Host, terminal.Session), header)
if err != nil {
println("Terminal 连接失败:", err)
@ -404,7 +411,8 @@ func handleTerminalTask(task *pb.Task) {
}
func println(v ...interface{}) {
if debug {
log.Println(v...)
if agentConf.Debug {
fmt.Printf("NEZHA@%s>> ", time.Now().Format("2006-01-02 15:04:05"))
fmt.Println(v...)
}
}

View File

@ -74,11 +74,7 @@ func GetHost() *model.Host {
}
}
type GetStateConfig struct {
SkipConnectionCount bool
}
func GetState(conf GetStateConfig) *model.HostState {
func GetState(skipConnectionCount bool) *model.HostState {
hi, _ := host.Info()
mv, _ := mem.VirtualMemory()
@ -101,7 +97,7 @@ func GetState(conf GetStateConfig) *model.HostState {
var tcpConnCount, udpConnCount uint64
if !conf.SkipConnectionCount {
if !skipConnectionCount {
conns, _ := net.Connections("all")
for i := 0; i < len(conns); i++ {
switch conns[i].Type {

View File

@ -271,8 +271,8 @@ func (cp *commonPage) terminal(c *gin.Context) {
}
defer conn.Close()
log.Println("terminal connected", isAgent, c.Request.URL)
defer log.Println("terminal disconnected", isAgent, c.Request.URL)
log.Println("NEZHA>> terminal connected", isAgent, c.Request.URL)
defer log.Println("NEZHA>> terminal disconnected", isAgent, c.Request.URL)
if isAgent {
terminal.agentConn = conn

View File

@ -198,7 +198,7 @@ func main() {
}
}
if err := dao.DB.Create(txs).Error; err != nil {
log.Println("流量统计入库", err)
log.Println("NEZHA>> 流量统计入库", err)
}
srv.Shutdown(c)
return nil

View File

@ -73,3 +73,7 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
}
return nil
}
// IsServiceSentinelNeeded reports whether a task result of type t should be
// dispatched to the service sentinel.
//
// Command and terminal tasks are not service-monitoring checks, so they are
// excluded; every other task type is routed to the sentinel. The previous
// condition was inverted (it returned true only for command/terminal types),
// which meant callers that had already filtered out command tasks forwarded
// nothing but terminal results and silently dropped real monitoring reports.
func IsServiceSentinelNeeded(t uint64) bool {
	return t != TaskTypeCommand && t != TaskTypeTerminal
}

View File

@ -54,7 +54,7 @@ func AlertSentinelStart() {
checkCount++
if lastPrint.Before(startedAt.Add(-1 * time.Hour)) {
if Conf.Debug {
log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now())
log.Println("NEZHA>> 报警规则检测每小时", checkCount, "次", startedAt, time.Now())
}
checkCount = 0
lastPrint = startedAt
@ -114,11 +114,11 @@ func checkStatus() {
max, passed := alert.Check(alertsStore[alert.ID][server.ID])
if !passed {
alertsPrevState[alert.ID][server.ID] = _RuleCheckFail
message := fmt.Sprintf("报警规则:%s服务器%s(%s),逮到咯,快去看看!", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP))
message := fmt.Sprintf("[主机故障] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
} else {
if alertsPrevState[alert.ID][server.ID] == _RuleCheckFail {
message := fmt.Sprintf("报警规则:%s服务器%s(%s),已恢复正常", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP))
message := fmt.Sprintf("[主机恢复] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
}
alertsPrevState[alert.ID][server.ID] = _RuleCheckPass

View File

@ -13,7 +13,7 @@ import (
pb "github.com/naiba/nezha/proto"
)
var Version = "v0.9.34" // !!记得修改 README 中的 badge 版本!!
var Version = "v0.10.0" // !!记得修改 README 中的 badge 版本!!
var (
Conf *model.Config
@ -64,7 +64,7 @@ func ManualTrigger(c *model.Cron) {
Type: model.TaskTypeCommand,
})
} else {
SendNotification(fmt.Sprintf("计划任务:%s服务器%s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false)
SendNotification(fmt.Sprintf("[任务失败] %s服务器 %s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false)
}
}
}
@ -91,7 +91,7 @@ func CronTrigger(cr model.Cron) func() {
Type: model.TaskTypeCommand,
})
} else {
SendNotification(fmt.Sprintf("计划任务:%s服务器%s 离线,无法执行。", cr.Name, s.Name), false)
SendNotification(fmt.Sprintf("[任务失败] %s服务器 %s 离线,无法执行。", cr.Name, s.Name), false)
}
}
}

View File

@ -71,7 +71,7 @@ func SendNotification(desc string, muteable bool) {
if !flag {
if Conf.Debug {
log.Println("静音的重复通知:", desc, muteable)
log.Println("NEZHA>> 静音的重复通知:", desc, muteable)
}
return
}
@ -81,7 +81,7 @@ func SendNotification(desc string, muteable bool) {
defer notificationsLock.RUnlock()
for i := 0; i < len(notifications); i++ {
if err := notifications[i].Send(desc); err != nil {
log.Println("发送通知失败:", err)
log.Println("NEZHA>> 发送通知失败:", err)
}
}
}

View File

@ -150,6 +150,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
monitors[i].CronJobID, err = ss.dispatchCron.AddFunc(task.CronSpec(), func() {
ss.dispatchBus <- task
})
log.Println("NEZHA>> 服务监控任务", monitors[i].ID, monitors[i].Name, monitors[i].CronJobID)
if err != nil {
panic(err)
}
@ -282,7 +283,8 @@ func getStateStr(percent uint64) string {
func (ss *ServiceSentinel) worker() {
for r := range ss.serviceReportChannel {
if ss.monitors[r.Data.GetId()].ID == 0 {
if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 {
log.Printf("NEZAH>> 错误的服务监控上报 %+v", r)
continue
}
mh := model.PB2MonitorHistory(r.Data)
@ -315,7 +317,7 @@ func (ss *ServiceSentinel) worker() {
ss.serviceCurrentStatusIndex[mh.MonitorID] = 0
dataToSave := ss.serviceCurrentStatusData[mh.MonitorID]
if err := DB.Create(&dataToSave).Error; err != nil {
log.Println("服务监控数据持久化失败:", err)
log.Println("NEZHA>> 服务监控数据持久化失败:", err)
}
}
// 更新当前状态
@ -337,7 +339,7 @@ func (ss *ServiceSentinel) worker() {
stateStr := getStateStr(upPercent)
if !mh.Successful {
ServerLock.RLock()
log.Println("服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data)
log.Println("NEZHA>> 服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data)
ServerLock.RUnlock()
}
if stateStr == "故障" || stateStr != ss.lastStatus[mh.MonitorID] {
@ -345,7 +347,7 @@ func (ss *ServiceSentinel) worker() {
isNeedSendNotification := (ss.lastStatus[mh.MonitorID] != "" || stateStr == "故障") && ss.monitors[mh.MonitorID].Notify
ss.lastStatus[mh.MonitorID] = stateStr
if isNeedSendNotification {
go SendNotification(fmt.Sprintf("服务监控:%s 服务状态:%s", ss.monitors[mh.MonitorID].Name, stateStr), true)
go SendNotification(fmt.Sprintf("[服务%s] %s", stateStr, ss.monitors[mh.MonitorID].Name), true)
}
ss.monitorsLock.RUnlock()
}
@ -389,7 +391,7 @@ func (ss *ServiceSentinel) worker() {
if errMsg != "" {
ss.monitorsLock.RLock()
if ss.monitors[mh.MonitorID].Notify {
go SendNotification(fmt.Sprintf("服务监控:%s %s", ss.monitors[mh.MonitorID].Name, errMsg), true)
go SendNotification(fmt.Sprintf("[SSL] %s %s", ss.monitors[mh.MonitorID].Name, errMsg), true)
}
ss.monitorsLock.RUnlock()
}

View File

@ -21,12 +21,7 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece
if clientID, err = s.Auth.Check(c); err != nil {
return nil, err
}
if r.GetType() != model.TaskTypeCommand {
dao.ServiceSentinelShared.Dispatch(dao.ReportData{
Data: r,
Reporter: clientID,
})
} else {
if r.GetType() == model.TaskTypeCommand {
// 处理上报的计划任务
dao.CronLock.RLock()
defer dao.CronLock.RUnlock()
@ -35,16 +30,21 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece
dao.ServerLock.RLock()
defer dao.ServerLock.RUnlock()
if cr.PushSuccessful && r.GetSuccessful() {
dao.SendNotification(fmt.Sprintf("成功计划任务:%s ,服务器:%s日志\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
dao.SendNotification(fmt.Sprintf("[任务成功] %s ,服务器:%s日志\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
}
if !r.GetSuccessful() {
dao.SendNotification(fmt.Sprintf("失败计划任务:%s ,服务器:%s日志\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
dao.SendNotification(fmt.Sprintf("[任务失败] %s ,服务器:%s日志\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
}
dao.DB.Model(cr).Updates(model.Cron{
LastExecutedAt: time.Now().Add(time.Second * -1 * time.Duration(r.GetDelay())),
LastResult: r.GetSuccessful(),
})
}
} else if model.IsServiceSentinelNeeded(r.GetType()) {
dao.ServiceSentinelShared.Dispatch(dao.ReportData{
Data: r,
Reporter: clientID,
})
}
return &pb.Receipt{Proced: true}, nil
}
@ -101,7 +101,7 @@ func (s *NezhaHandler) ReportSystemInfo(c context.Context, r *pb.Host) (*pb.Rece
host.IP != "" &&
dao.ServerList[clientID].Host.IP != host.IP {
dao.SendNotification(fmt.Sprintf(
"IP变更提醒 服务器:%s 旧IP%s新IP%s。",
"[IP变更] %s 旧IP%s新IP%s。",
dao.ServerList[clientID].Name, utils.IPDesensitize(dao.ServerList[clientID].Host.IP), utils.IPDesensitize(host.IP)), true)
}