diff --git a/README.md b/README.md index 7bbd64e..99b643c 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -
+


LOGO designed by 熊大 .

-    +   

-

:trollface: 哪吒监控 一站式轻监控轻运维系统。支持系统状态、HTTP(SSL 证书变更、即将到期、到期)、TCP、Ping 监控报警,命令批量执行和计划任务。

+

:trollface: 哪吒监控 一站式轻监控轻运维系统。支持系统状态、HTTP(SSL 证书变更、即将到期、到期)、TCP、Ping 监控报警,命令批量执行和计划任务。

\>> QQ 交流群:872069346 **加群要求:已搭建好哪吒监控 & 有 2+ 服务器** @@ -20,7 +20,7 @@ ## 安装脚本 -**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP,比如(randomdashboard.nai.ba)。 +**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP,比如(ip-to-dashboard.nai.ba)。 ```shell curl -L https://raw.githubusercontent.com/naiba/nezha/master/script/install.sh -o nezha.sh && chmod +x nezha.sh @@ -36,6 +36,13 @@ CN=true sudo ./nezha.sh _\* 使用 WatchTower 可以自动更新面板,Windows 终端可以使用 nssm 配置自启动(见尾部教程)_ +### 特殊技能 + +编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上 + +- `--skip-conn` 不监控连接数,机场/连接密集型机器推荐设置,不然比较占CPU([shirou/gopsutil/issues#220](https://github.com/shirou/gopsutil/issues/220)) +- `--disable-auto-update` 禁止 Agent 自动更新 + ## 功能说明
@@ -63,7 +70,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。 - server 酱示例 - 名称:server 酱 - - URL:https://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA# + - URL:<https://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA#> - 请求方式: GET - 请求类型: 默认 - Body: 空 @@ -71,7 +78,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。 - wxpusher 示例,需要关注你的应用 - 名称: wxpusher - - URL:http://wxpusher.zjiecode.com/api/send/message + - URL:<http://wxpusher.zjiecode.com/api/send/message> - 请求方式: POST - 请求类型: JSON - Body: `{"appToken":"你的appToken","topicIds":[],"content":"#NEZHA#","contentType":"1","uids":["你的uid"]}` @@ -79,7 +86,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。 - telegram 示例 [@haitau](https://github.com/haitau) 贡献 - 名称:telegram 机器人消息通知 - - URL:https://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA# + - URL:<https://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA#> - 请求方式: GET - 请求类型: 默认 - Body: 空 @@ -130,6 +137,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。 - cycle_interval 小时(可以设为 1 月,30\*24) - min/max、cover、ignore 参考基本规则配置 - 示例: 3 号机器的每月 15 号计费的出站月流量 1T 报警 `[{"type":"transfer_out_cycle","max":1000000000000,"cycle_start":"2021-07-15T08:00:00Z","cycle_interval":720,"cover":1,"ignore":{"3":true}}]` +
@@ -221,19 +229,13 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
-
- 如何禁用网络连接数监控? - -编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上 `--skip-conn` 就是不监控连接数 - -
-
Agent 不断重启/无法启动 ? 1. 直接执行 `/opt/nezha/agent/nezha-agent -s 面板IP或非CDN域名:面板RPC端口 -p Agent密钥 -d` 查看日志是否是 DNS 问题。 -2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 https://port.ping.pe/ 提供的端口检查工具进行检测。 +2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 <https://port.ping.pe/> 提供的端口检查工具进行检测。 3. 如果上面步骤检测正常,Agent 正常上线,尝试关闭 SELinux,[如何关闭 SELinux?](https://www.google.com/search?q=%E5%85%B3%E9%97%ADSELINUX) +
@@ -248,10 +250,10 @@ START=99 USE_PROCD=1 start_service() { - procd_open_instance - procd_set_param command /root/nezha-agent -s 面板网址:接收端口 -p 唯一秘钥 -d - procd_set_param respawn - procd_close_instance + procd_open_instance + procd_set_param command /root/nezha-agent -s 面板网址:接收端口 -p 唯一秘钥 -d + procd_set_param respawn + procd_close_instance } stop_service() { @@ -259,9 +261,9 @@ stop_service() { } restart() { - stop - sleep 2 - start + stop + sleep 2 + start } ``` diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 5a06199..a1a39db 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -6,7 +6,6 @@ import ( "errors" "fmt" "io" - "log" "net" "net/http" "os" @@ -29,15 +28,22 @@ import ( "github.com/naiba/nezha/service/rpc" ) +type AgentConfig struct { + SkipConnectionCount bool + DisableAutoUpdate bool + Debug bool + Server string + ClientSecret string +} + func init() { http.DefaultClient.Timeout = time.Second * 5 flag.CommandLine.ParseErrorsWhitelist.UnknownFlags = true } var ( - server, clientSecret, version string - debug bool - stateConf monitor.GetStateConfig + version string + agentConf AgentConfig ) var ( @@ -61,13 +67,14 @@ func main() { // 来自于 GoReleaser 的版本号 monitor.Version = version - flag.BoolVarP(&debug, "debug", "d", false, "开启调试信息") - flag.StringVarP(&server, "*server", "s", "localhost:5555", "管理面板RPC端口") - flag.StringVarP(&clientSecret, "password", "p", "", "Agent连接Secret") - flag.BoolVar(&stateConf.SkipConnectionCount, "skip-conn", false, "不监控连接数") + flag.BoolVarP(&agentConf.Debug, "debug", "d", false, "开启调试信息") + flag.StringVarP(&agentConf.Server, "*server", "s", "localhost:5555", "管理面板RPC端口") + flag.StringVarP(&agentConf.ClientSecret, "password", "p", "", "Agent连接Secret") + flag.BoolVar(&agentConf.SkipConnectionCount, "skip-conn", false, "不监控连接数") + flag.BoolVar(&agentConf.DisableAutoUpdate, "disable-auto-update", false, "禁用自动升级") flag.Parse() - if clientSecret == "" { + if agentConf.ClientSecret == "" { flag.Usage() return } @@ -77,7 +84,7 @@ 
func main() { func run() { auth := rpc.AuthHandler{ - ClientSecret: clientSecret, + ClientSecret: agentConf.ClientSecret, } go pty.DownloadDependency() @@ -87,7 +94,7 @@ func run() { // 更新IP信息 go monitor.UpdateIP() - if _, err := semver.Parse(version); err == nil { + if _, err := semver.Parse(version); err == nil && !agentConf.DisableAutoUpdate { go func() { for range updateCh { go doSelfUpdate() @@ -111,7 +118,7 @@ func run() { for { timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut) - conn, err = grpc.DialContext(timeOutCtx, server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth)) + conn, err = grpc.DialContext(timeOutCtx, agentConf.Server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth)) if err != nil { println("与面板建立连接失败:", err) cancel() @@ -194,7 +201,7 @@ func reportState() { if client != nil && inited { monitor.TrackNetworkSpeed() timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut) - _, err = client.ReportSystemState(timeOutCtx, monitor.GetState(stateConf).PB()) + _, err = client.ReportSystemState(timeOutCtx, monitor.GetState(agentConf.SkipConnectionCount).PB()) cancel() if err != nil { println("reportState error", err) @@ -334,7 +341,7 @@ func handleTerminalTask(task *pb.Task) { protocol += "s" } header := http.Header{} - header.Add("Secret", clientSecret) + header.Add("Secret", agentConf.ClientSecret) conn, _, err := websocket.DefaultDialer.Dial(fmt.Sprintf("%s://%s/terminal/%s", protocol, terminal.Host, terminal.Session), header) if err != nil { println("Terminal 连接失败:", err) @@ -404,7 +411,8 @@ func handleTerminalTask(task *pb.Task) { } func println(v ...interface{}) { - if debug { - log.Println(v...) + if agentConf.Debug { + fmt.Printf("NEZHA@%s>> ", time.Now().Format("2006-01-02 15:04:05")) + fmt.Println(v...) 
} } diff --git a/cmd/agent/monitor/monitor.go b/cmd/agent/monitor/monitor.go index 21a1242..d2febd5 100644 --- a/cmd/agent/monitor/monitor.go +++ b/cmd/agent/monitor/monitor.go @@ -74,11 +74,7 @@ func GetHost() *model.Host { } } -type GetStateConfig struct { - SkipConnectionCount bool -} - -func GetState(conf GetStateConfig) *model.HostState { +func GetState(skipConnectionCount bool) *model.HostState { hi, _ := host.Info() mv, _ := mem.VirtualMemory() @@ -101,7 +97,7 @@ func GetState(conf GetStateConfig) *model.HostState { var tcpConnCount, udpConnCount uint64 - if !conf.SkipConnectionCount { + if !skipConnectionCount { conns, _ := net.Connections("all") for i := 0; i < len(conns); i++ { switch conns[i].Type { diff --git a/cmd/dashboard/controller/common_page.go b/cmd/dashboard/controller/common_page.go index cad348b..6447d7e 100644 --- a/cmd/dashboard/controller/common_page.go +++ b/cmd/dashboard/controller/common_page.go @@ -271,8 +271,8 @@ func (cp *commonPage) terminal(c *gin.Context) { } defer conn.Close() - log.Println("terminal connected", isAgent, c.Request.URL) - defer log.Println("terminal disconnected", isAgent, c.Request.URL) + log.Println("NEZHA>> terminal connected", isAgent, c.Request.URL) + defer log.Println("NEZHA>> terminal disconnected", isAgent, c.Request.URL) if isAgent { terminal.agentConn = conn diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index 722a0bc..071d942 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -198,7 +198,7 @@ func main() { } } if err := dao.DB.Create(txs).Error; err != nil { - log.Println("流量统计入库", err) + log.Println("NEZHA>> 流量统计入库", err) } srv.Shutdown(c) return nil diff --git a/model/monitor.go b/model/monitor.go index f3e7cbd..abc9b8d 100644 --- a/model/monitor.go +++ b/model/monitor.go @@ -73,3 +73,8 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error { } return nil } + +// IsServiceSentinelNeeded reports whether a task result of type t should be dispatched to the service sentinel, i.e. t is neither a command task nor a terminal task. +func IsServiceSentinelNeeded(t uint64) bool { + return t != TaskTypeCommand && t != TaskTypeTerminal +} diff --git 
a/service/dao/alertsentinel.go b/service/dao/alertsentinel.go index 2468931..cc87992 100644 --- a/service/dao/alertsentinel.go +++ b/service/dao/alertsentinel.go @@ -54,7 +54,7 @@ func AlertSentinelStart() { checkCount++ if lastPrint.Before(startedAt.Add(-1 * time.Hour)) { if Conf.Debug { - log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now()) + log.Println("NEZHA>> 报警规则检测每小时", checkCount, "次", startedAt, time.Now()) } checkCount = 0 lastPrint = startedAt @@ -114,11 +114,11 @@ func checkStatus() { max, passed := alert.Check(alertsStore[alert.ID][server.ID]) if !passed { alertsPrevState[alert.ID][server.ID] = _RuleCheckFail - message := fmt.Sprintf("报警规则:%s,服务器:%s(%s),逮到咯,快去看看!", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP)) + message := fmt.Sprintf("[主机故障] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name) go SendNotification(message, true) } else { if alertsPrevState[alert.ID][server.ID] == _RuleCheckFail { - message := fmt.Sprintf("报警规则:%s,服务器:%s(%s),已恢复正常", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP)) + message := fmt.Sprintf("[主机恢复] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name) go SendNotification(message, true) } alertsPrevState[alert.ID][server.ID] = _RuleCheckPass diff --git a/service/dao/dao.go b/service/dao/dao.go index f7f720f..dfcd791 100644 --- a/service/dao/dao.go +++ b/service/dao/dao.go @@ -13,7 +13,7 @@ import ( pb "github.com/naiba/nezha/proto" ) -var Version = "v0.9.34" // !!记得修改 README 中的 badge 版本!! +var Version = "v0.10.0" // !!记得修改 README 中的 badge 版本!! 
var ( Conf *model.Config @@ -64,7 +64,7 @@ func ManualTrigger(c *model.Cron) { Type: model.TaskTypeCommand, }) } else { - SendNotification(fmt.Sprintf("计划任务:%s,服务器:%s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false) + SendNotification(fmt.Sprintf("[任务失败] %s,服务器 %s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false) } } } @@ -91,7 +91,7 @@ func CronTrigger(cr model.Cron) func() { Type: model.TaskTypeCommand, }) } else { - SendNotification(fmt.Sprintf("计划任务:%s,服务器:%s 离线,无法执行。", cr.Name, s.Name), false) + SendNotification(fmt.Sprintf("[任务失败] %s,服务器 %s 离线,无法执行。", cr.Name, s.Name), false) } } } diff --git a/service/dao/notification.go b/service/dao/notification.go index 3120613..bc675de 100644 --- a/service/dao/notification.go +++ b/service/dao/notification.go @@ -71,7 +71,7 @@ func SendNotification(desc string, muteable bool) { if !flag { if Conf.Debug { - log.Println("静音的重复通知:", desc, muteable) + log.Println("NEZHA>> 静音的重复通知:", desc, muteable) } return } @@ -81,7 +81,7 @@ func SendNotification(desc string, muteable bool) { defer notificationsLock.RUnlock() for i := 0; i < len(notifications); i++ { if err := notifications[i].Send(desc); err != nil { - log.Println("发送通知失败:", err) + log.Println("NEZHA>> 发送通知失败:", err) } } } diff --git a/service/dao/servicesentinel.go b/service/dao/servicesentinel.go index e255ab8..56f3db5 100644 --- a/service/dao/servicesentinel.go +++ b/service/dao/servicesentinel.go @@ -150,6 +150,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() { monitors[i].CronJobID, err = ss.dispatchCron.AddFunc(task.CronSpec(), func() { ss.dispatchBus <- task }) + log.Println("NEZHA>> 服务监控任务", monitors[i].ID, monitors[i].Name, monitors[i].CronJobID) if err != nil { panic(err) } @@ -282,7 +283,8 @@ func getStateStr(percent uint64) string { func (ss *ServiceSentinel) worker() { for r := range ss.serviceReportChannel { - if ss.monitors[r.Data.GetId()].ID == 0 { + if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 { + 
log.Printf("NEZHA>> 错误的服务监控上报 %+v", r) continue } mh := model.PB2MonitorHistory(r.Data) @@ -315,7 +317,7 @@ func (ss *ServiceSentinel) worker() { ss.serviceCurrentStatusIndex[mh.MonitorID] = 0 dataToSave := ss.serviceCurrentStatusData[mh.MonitorID] if err := DB.Create(&dataToSave).Error; err != nil { - log.Println("服务监控数据持久化失败:", err) + log.Println("NEZHA>> 服务监控数据持久化失败:", err) } } // 更新当前状态 @@ -337,7 +339,7 @@ func (ss *ServiceSentinel) worker() { stateStr := getStateStr(upPercent) if !mh.Successful { ServerLock.RLock() - log.Println("服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data) + log.Println("NEZHA>> 服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data) ServerLock.RUnlock() } if stateStr == "故障" || stateStr != ss.lastStatus[mh.MonitorID] { @@ -345,7 +347,7 @@ func (ss *ServiceSentinel) worker() { isNeedSendNotification := (ss.lastStatus[mh.MonitorID] != "" || stateStr == "故障") && ss.monitors[mh.MonitorID].Notify ss.lastStatus[mh.MonitorID] = stateStr if isNeedSendNotification { - go SendNotification(fmt.Sprintf("服务监控:%s 服务状态:%s", ss.monitors[mh.MonitorID].Name, stateStr), true) + go SendNotification(fmt.Sprintf("[服务%s] %s", stateStr, ss.monitors[mh.MonitorID].Name), true) } ss.monitorsLock.RUnlock() } @@ -389,7 +391,7 @@ func (ss *ServiceSentinel) worker() { if errMsg != "" { ss.monitorsLock.RLock() if ss.monitors[mh.MonitorID].Notify { - go SendNotification(fmt.Sprintf("服务监控:%s %s", ss.monitors[mh.MonitorID].Name, errMsg), true) + go SendNotification(fmt.Sprintf("[SSL] %s %s", ss.monitors[mh.MonitorID].Name, errMsg), true) } ss.monitorsLock.RUnlock() } diff --git a/service/rpc/nezha.go b/service/rpc/nezha.go index 8dcc02f..9ec4f2f 100644 --- a/service/rpc/nezha.go +++ b/service/rpc/nezha.go @@ -21,12 +21,7 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece if clientID, err = s.Auth.Check(c); err != nil { return 
nil, err } - if r.GetType() != model.TaskTypeCommand { - dao.ServiceSentinelShared.Dispatch(dao.ReportData{ - Data: r, - Reporter: clientID, - }) - } else { + if r.GetType() == model.TaskTypeCommand { // 处理上报的计划任务 dao.CronLock.RLock() defer dao.CronLock.RUnlock() @@ -35,16 +30,21 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece dao.ServerLock.RLock() defer dao.ServerLock.RUnlock() if cr.PushSuccessful && r.GetSuccessful() { - dao.SendNotification(fmt.Sprintf("成功计划任务:%s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false) + dao.SendNotification(fmt.Sprintf("[任务成功] %s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false) } if !r.GetSuccessful() { - dao.SendNotification(fmt.Sprintf("失败计划任务:%s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false) + dao.SendNotification(fmt.Sprintf("[任务失败] %s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false) } dao.DB.Model(cr).Updates(model.Cron{ LastExecutedAt: time.Now().Add(time.Second * -1 * time.Duration(r.GetDelay())), LastResult: r.GetSuccessful(), }) } + } else if model.IsServiceSentinelNeeded(r.GetType()) { + dao.ServiceSentinelShared.Dispatch(dao.ReportData{ + Data: r, + Reporter: clientID, + }) } return &pb.Receipt{Proced: true}, nil } @@ -101,7 +101,7 @@ func (s *NezhaHandler) ReportSystemInfo(c context.Context, r *pb.Host) (*pb.Rece host.IP != "" && dao.ServerList[clientID].Host.IP != host.IP { dao.SendNotification(fmt.Sprintf( - "IP变更提醒 服务器:%s ,旧IP:%s,新IP:%s。", + "[IP变更] %s ,旧IP:%s,新IP:%s。", dao.ServerList[clientID].Name, utils.IPDesensitize(dao.ServerList[clientID].Host.IP), utils.IPDesensitize(host.IP)), true) }