diff --git a/README.md b/README.md
index 7bbd64e..99b643c 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
-
+
\>> QQ 交流群:872069346 **加群要求:已搭建好哪吒监控 & 有 2+ 服务器**
@@ -20,7 +20,7 @@
## 安装脚本
-**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP,比如(randomdashboard.nai.ba)。
+**推荐配置:** 安装前准备 _两个域名_,一个可以 **接入 CDN** 作为 _公开访问_,比如 (status.nai.ba);另外一个解析到面板服务器作为 Agent 连接 Dashboard 使用,**不能接入 CDN** 直接暴露面板主机 IP,比如(ip-to-dashboard.nai.ba)。
```shell
curl -L https://raw.githubusercontent.com/naiba/nezha/master/script/install.sh -o nezha.sh && chmod +x nezha.sh
@@ -36,6 +36,13 @@ CN=true sudo ./nezha.sh
_\* 使用 WatchTower 可以自动更新面板,Windows 终端可以使用 nssm 配置自启动(见尾部教程)_
+### 特殊技能
+
+编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上
+
+- `--skip-conn` 不监控连接数,机场/连接密集型机器推荐设置,否则会比较占用 CPU([shirou/gopsutil/issues#220](https://github.com/shirou/gopsutil/issues/220))
+- `--disable-auto-update` 禁止 Agent 自动更新
+
## 功能说明
@@ -63,7 +70,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- server 酱示例
- 名称:server 酱
- - URL:https://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA#
+ - URL:https://sc.ftqq.com/SCUrandomkeys.send?text=#NEZHA#
- 请求方式: GET
- 请求类型: 默认
- Body: 空
@@ -71,7 +78,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- wxpusher 示例,需要关注你的应用
- 名称: wxpusher
- - URL:http://wxpusher.zjiecode.com/api/send/message
+ - URL:http://wxpusher.zjiecode.com/api/send/message
- 请求方式: POST
- 请求类型: JSON
- Body: `{"appToken":"你的appToken","topicIds":[],"content":"#NEZHA#","contentType":"1","uids":["你的uid"]}`
@@ -79,7 +86,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- telegram 示例 [@haitau](https://github.com/haitau) 贡献
- 名称:telegram 机器人消息通知
- - URL:https://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA#
+ - URL:https://api.telegram.org/botXXXXXX/sendMessage?chat_id=YYYYYY&text=#NEZHA#
- 请求方式: GET
- 请求类型: 默认
- Body: 空
@@ -130,6 +137,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- cycle_interval 小时(可以设为 1 月,30\*24)
- min/max、cover、ignore 参考基本规则配置
- 示例: 3 号机器的每月 15 号计费的出站月流量 1T 报警 `[{"type":"transfer_out_cycle","max":1000000000000,"cycle_start":"2021-07-15T08:00:00Z","cycle_interval":720,"cover":1,"ignore":{"3":true}}]`
+
@@ -221,19 +229,13 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
-
- 如何禁用网络连接数监控?
-
-编辑 `/etc/systemd/system/nezha-agent.service`,在 `ExecStart=` 这一行的末尾加上 `--skip-conn` 就是不监控连接数
-
-
-
Agent 不断重启/无法启动 ?
1. 直接执行 `/opt/nezha/agent/nezha-agent -s 面板IP或非CDN域名:面板RPC端口 -p Agent密钥 -d` 查看日志是否是 DNS 问题。
-2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 https://port.ping.pe/ 提供的端口检查工具进行检测。
+2. `nc -v 域名/IP 面板RPC端口` 或者 `telnet 域名/IP 面板RPC端口` 检验是否是网络问题,检查本机与面板服务器出入站防火墙,如果单机无法判断可借助 https://port.ping.pe/ 提供的端口检查工具进行检测。
3. 如果上面步骤检测正常,Agent 正常上线,尝试关闭 SELinux,[如何关闭 SELinux?](https://www.google.com/search?q=%E5%85%B3%E9%97%ADSELINUX)
+
@@ -248,10 +250,10 @@ START=99
USE_PROCD=1
start_service() {
- procd_open_instance
- procd_set_param command /root/nezha-agent -s 面板网址:接收端口 -p 唯一秘钥 -d
- procd_set_param respawn
- procd_close_instance
+ procd_open_instance
+ procd_set_param command /root/nezha-agent -s 面板网址:接收端口 -p 唯一秘钥 -d
+ procd_set_param respawn
+ procd_close_instance
}
stop_service() {
@@ -259,9 +261,9 @@ stop_service() {
}
restart() {
- stop
- sleep 2
- start
+ stop
+ sleep 2
+ start
}
```
diff --git a/cmd/agent/main.go b/cmd/agent/main.go
index 5a06199..a1a39db 100644
--- a/cmd/agent/main.go
+++ b/cmd/agent/main.go
@@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"io"
- "log"
"net"
"net/http"
"os"
@@ -29,15 +28,22 @@ import (
"github.com/naiba/nezha/service/rpc"
)
+// AgentConfig groups the agent's command-line settings; main binds each field
+// to a flag and the rest of the agent reads them through the agentConf variable.
+type AgentConfig struct {
+ SkipConnectionCount bool // --skip-conn: passed to monitor.GetState so connection counting is skipped
+ DisableAutoUpdate bool // --disable-auto-update: when true, run does not start the self-update loop
+ Debug bool // --debug / -d: enables output from the package-level println helper
+ Server string // -s: dashboard RPC address dialed over gRPC (default "localhost:5555")
+ ClientSecret string // --password / -p: agent secret for RPC auth and the terminal websocket; main prints usage and exits when empty
+}
+
func init() {
http.DefaultClient.Timeout = time.Second * 5
flag.CommandLine.ParseErrorsWhitelist.UnknownFlags = true
}
var (
- server, clientSecret, version string
- debug bool
- stateConf monitor.GetStateConfig
+ version string
+ agentConf AgentConfig
)
var (
@@ -61,13 +67,14 @@ func main() {
// 来自于 GoReleaser 的版本号
monitor.Version = version
- flag.BoolVarP(&debug, "debug", "d", false, "开启调试信息")
- flag.StringVarP(&server, "*server", "s", "localhost:5555", "管理面板RPC端口")
- flag.StringVarP(&clientSecret, "password", "p", "", "Agent连接Secret")
- flag.BoolVar(&stateConf.SkipConnectionCount, "skip-conn", false, "不监控连接数")
+ flag.BoolVarP(&agentConf.Debug, "debug", "d", false, "开启调试信息")
+ flag.StringVarP(&agentConf.Server, "*server", "s", "localhost:5555", "管理面板RPC端口")
+ flag.StringVarP(&agentConf.ClientSecret, "password", "p", "", "Agent连接Secret")
+ flag.BoolVar(&agentConf.SkipConnectionCount, "skip-conn", false, "不监控连接数")
+ flag.BoolVar(&agentConf.DisableAutoUpdate, "disable-auto-update", false, "禁用自动升级")
flag.Parse()
- if clientSecret == "" {
+ if agentConf.ClientSecret == "" {
flag.Usage()
return
}
@@ -77,7 +84,7 @@ func main() {
func run() {
auth := rpc.AuthHandler{
- ClientSecret: clientSecret,
+ ClientSecret: agentConf.ClientSecret,
}
go pty.DownloadDependency()
@@ -87,7 +94,7 @@ func run() {
// 更新IP信息
go monitor.UpdateIP()
- if _, err := semver.Parse(version); err == nil {
+ if _, err := semver.Parse(version); err == nil && !agentConf.DisableAutoUpdate {
go func() {
for range updateCh {
go doSelfUpdate()
@@ -111,7 +118,7 @@ func run() {
for {
timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
- conn, err = grpc.DialContext(timeOutCtx, server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth))
+ conn, err = grpc.DialContext(timeOutCtx, agentConf.Server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth))
if err != nil {
println("与面板建立连接失败:", err)
cancel()
@@ -194,7 +201,7 @@ func reportState() {
if client != nil && inited {
monitor.TrackNetworkSpeed()
timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
- _, err = client.ReportSystemState(timeOutCtx, monitor.GetState(stateConf).PB())
+ _, err = client.ReportSystemState(timeOutCtx, monitor.GetState(agentConf.SkipConnectionCount).PB())
cancel()
if err != nil {
println("reportState error", err)
@@ -334,7 +341,7 @@ func handleTerminalTask(task *pb.Task) {
protocol += "s"
}
header := http.Header{}
- header.Add("Secret", clientSecret)
+ header.Add("Secret", agentConf.ClientSecret)
conn, _, err := websocket.DefaultDialer.Dial(fmt.Sprintf("%s://%s/terminal/%s", protocol, terminal.Host, terminal.Session), header)
if err != nil {
println("Terminal 连接失败:", err)
@@ -404,7 +411,8 @@ func handleTerminalTask(task *pb.Task) {
}
+// println writes a timestamped debug line to standard output when the agent
+// was started with --debug; otherwise it does nothing. It shadows the builtin
+// println for this package. Operands are formatted like fmt.Println
+// (space-separated, newline-terminated).
func println(v ...interface{}) {
- if debug {
- log.Println(v...)
+	if agentConf.Debug {
+		// Format prefix and message in one Printf call: the agent logs from
+		// several goroutines, and two separate writes could interleave.
+		fmt.Printf("NEZHA@%s>> %s", time.Now().Format("2006-01-02 15:04:05"), fmt.Sprintln(v...))
}
}
diff --git a/cmd/agent/monitor/monitor.go b/cmd/agent/monitor/monitor.go
index 21a1242..d2febd5 100644
--- a/cmd/agent/monitor/monitor.go
+++ b/cmd/agent/monitor/monitor.go
@@ -74,11 +74,7 @@ func GetHost() *model.Host {
}
}
-type GetStateConfig struct {
- SkipConnectionCount bool
-}
-
-func GetState(conf GetStateConfig) *model.HostState {
+func GetState(skipConnectionCount bool) *model.HostState {
hi, _ := host.Info()
mv, _ := mem.VirtualMemory()
@@ -101,7 +97,7 @@ func GetState(conf GetStateConfig) *model.HostState {
var tcpConnCount, udpConnCount uint64
- if !conf.SkipConnectionCount {
+ if !skipConnectionCount {
conns, _ := net.Connections("all")
for i := 0; i < len(conns); i++ {
switch conns[i].Type {
diff --git a/cmd/dashboard/controller/common_page.go b/cmd/dashboard/controller/common_page.go
index cad348b..6447d7e 100644
--- a/cmd/dashboard/controller/common_page.go
+++ b/cmd/dashboard/controller/common_page.go
@@ -271,8 +271,8 @@ func (cp *commonPage) terminal(c *gin.Context) {
}
defer conn.Close()
- log.Println("terminal connected", isAgent, c.Request.URL)
- defer log.Println("terminal disconnected", isAgent, c.Request.URL)
+ log.Println("NEZHA>> terminal connected", isAgent, c.Request.URL)
+ defer log.Println("NEZHA>> terminal disconnected", isAgent, c.Request.URL)
if isAgent {
terminal.agentConn = conn
diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go
index 722a0bc..071d942 100644
--- a/cmd/dashboard/main.go
+++ b/cmd/dashboard/main.go
@@ -198,7 +198,7 @@ func main() {
}
}
if err := dao.DB.Create(txs).Error; err != nil {
- log.Println("流量统计入库", err)
+ log.Println("NEZHA>> 流量统计入库", err)
}
srv.Shutdown(c)
return nil
diff --git a/model/monitor.go b/model/monitor.go
index f3e7cbd..abc9b8d 100644
--- a/model/monitor.go
+++ b/model/monitor.go
@@ -73,3 +73,7 @@ func (m *Monitor) AfterFind(tx *gorm.DB) error {
}
return nil
}
+
+// IsServiceSentinelNeeded reports whether a task result of type t should be
+// dispatched to the service sentinel. Command (cron) and terminal tasks are
+// excluded; every other task type is treated as a service-monitor report.
+func IsServiceSentinelNeeded(t uint64) bool {
+	return t != TaskTypeCommand && t != TaskTypeTerminal
+}
diff --git a/service/dao/alertsentinel.go b/service/dao/alertsentinel.go
index 2468931..cc87992 100644
--- a/service/dao/alertsentinel.go
+++ b/service/dao/alertsentinel.go
@@ -54,7 +54,7 @@ func AlertSentinelStart() {
checkCount++
if lastPrint.Before(startedAt.Add(-1 * time.Hour)) {
if Conf.Debug {
- log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now())
+ log.Println("NEZHA>> 报警规则检测每小时", checkCount, "次", startedAt, time.Now())
}
checkCount = 0
lastPrint = startedAt
@@ -114,11 +114,11 @@ func checkStatus() {
max, passed := alert.Check(alertsStore[alert.ID][server.ID])
if !passed {
alertsPrevState[alert.ID][server.ID] = _RuleCheckFail
- message := fmt.Sprintf("报警规则:%s,服务器:%s(%s),逮到咯,快去看看!", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP))
+ message := fmt.Sprintf("[主机故障] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
} else {
if alertsPrevState[alert.ID][server.ID] == _RuleCheckFail {
- message := fmt.Sprintf("报警规则:%s,服务器:%s(%s),已恢复正常", alert.Name, server.Name, utils.IPDesensitize(server.Host.IP))
+ message := fmt.Sprintf("[主机恢复] %s(%s) 规则:%s", server.Name, utils.IPDesensitize(server.Host.IP), alert.Name)
go SendNotification(message, true)
}
alertsPrevState[alert.ID][server.ID] = _RuleCheckPass
diff --git a/service/dao/dao.go b/service/dao/dao.go
index f7f720f..dfcd791 100644
--- a/service/dao/dao.go
+++ b/service/dao/dao.go
@@ -13,7 +13,7 @@ import (
pb "github.com/naiba/nezha/proto"
)
-var Version = "v0.9.34" // !!记得修改 README 中的 badge 版本!!
+var Version = "v0.10.0" // !!记得修改 README 中的 badge 版本!!
var (
Conf *model.Config
@@ -64,7 +64,7 @@ func ManualTrigger(c *model.Cron) {
Type: model.TaskTypeCommand,
})
} else {
- SendNotification(fmt.Sprintf("计划任务:%s,服务器:%s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false)
+ SendNotification(fmt.Sprintf("[任务失败] %s,服务器 %s 离线,无法执行。", c.Name, ServerList[c.Servers[j]].Name), false)
}
}
}
@@ -91,7 +91,7 @@ func CronTrigger(cr model.Cron) func() {
Type: model.TaskTypeCommand,
})
} else {
- SendNotification(fmt.Sprintf("计划任务:%s,服务器:%s 离线,无法执行。", cr.Name, s.Name), false)
+ SendNotification(fmt.Sprintf("[任务失败] %s,服务器 %s 离线,无法执行。", cr.Name, s.Name), false)
}
}
}
diff --git a/service/dao/notification.go b/service/dao/notification.go
index 3120613..bc675de 100644
--- a/service/dao/notification.go
+++ b/service/dao/notification.go
@@ -71,7 +71,7 @@ func SendNotification(desc string, muteable bool) {
if !flag {
if Conf.Debug {
- log.Println("静音的重复通知:", desc, muteable)
+ log.Println("NEZHA>> 静音的重复通知:", desc, muteable)
}
return
}
@@ -81,7 +81,7 @@ func SendNotification(desc string, muteable bool) {
defer notificationsLock.RUnlock()
for i := 0; i < len(notifications); i++ {
if err := notifications[i].Send(desc); err != nil {
- log.Println("发送通知失败:", err)
+ log.Println("NEZHA>> 发送通知失败:", err)
}
}
}
diff --git a/service/dao/servicesentinel.go b/service/dao/servicesentinel.go
index e255ab8..56f3db5 100644
--- a/service/dao/servicesentinel.go
+++ b/service/dao/servicesentinel.go
@@ -150,6 +150,7 @@ func (ss *ServiceSentinel) loadMonitorHistory() {
monitors[i].CronJobID, err = ss.dispatchCron.AddFunc(task.CronSpec(), func() {
ss.dispatchBus <- task
})
+ log.Println("NEZHA>> 服务监控任务", monitors[i].ID, monitors[i].Name, monitors[i].CronJobID)
if err != nil {
panic(err)
}
@@ -282,7 +283,8 @@ func getStateStr(percent uint64) string {
func (ss *ServiceSentinel) worker() {
for r := range ss.serviceReportChannel {
- if ss.monitors[r.Data.GetId()].ID == 0 {
+ if ss.monitors[r.Data.GetId()] == nil || ss.monitors[r.Data.GetId()].ID == 0 {
+ log.Printf("NEZHA>> 错误的服务监控上报 %+v", r)
continue
}
mh := model.PB2MonitorHistory(r.Data)
@@ -315,7 +317,7 @@ func (ss *ServiceSentinel) worker() {
ss.serviceCurrentStatusIndex[mh.MonitorID] = 0
dataToSave := ss.serviceCurrentStatusData[mh.MonitorID]
if err := DB.Create(&dataToSave).Error; err != nil {
- log.Println("服务监控数据持久化失败:", err)
+ log.Println("NEZHA>> 服务监控数据持久化失败:", err)
}
}
// 更新当前状态
@@ -337,7 +339,7 @@ func (ss *ServiceSentinel) worker() {
stateStr := getStateStr(upPercent)
if !mh.Successful {
ServerLock.RLock()
- log.Println("服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data)
+ log.Println("NEZHA>> 服务故障上报:", ss.monitors[mh.MonitorID].Target, stateStr, "上报者:", ServerList[r.Reporter].Name, "请求输出:", mh.Data)
ServerLock.RUnlock()
}
if stateStr == "故障" || stateStr != ss.lastStatus[mh.MonitorID] {
@@ -345,7 +347,7 @@ func (ss *ServiceSentinel) worker() {
isNeedSendNotification := (ss.lastStatus[mh.MonitorID] != "" || stateStr == "故障") && ss.monitors[mh.MonitorID].Notify
ss.lastStatus[mh.MonitorID] = stateStr
if isNeedSendNotification {
- go SendNotification(fmt.Sprintf("服务监控:%s 服务状态:%s", ss.monitors[mh.MonitorID].Name, stateStr), true)
+ go SendNotification(fmt.Sprintf("[服务%s] %s", stateStr, ss.monitors[mh.MonitorID].Name), true)
}
ss.monitorsLock.RUnlock()
}
@@ -389,7 +391,7 @@ func (ss *ServiceSentinel) worker() {
if errMsg != "" {
ss.monitorsLock.RLock()
if ss.monitors[mh.MonitorID].Notify {
- go SendNotification(fmt.Sprintf("服务监控:%s %s", ss.monitors[mh.MonitorID].Name, errMsg), true)
+ go SendNotification(fmt.Sprintf("[SSL] %s %s", ss.monitors[mh.MonitorID].Name, errMsg), true)
}
ss.monitorsLock.RUnlock()
}
diff --git a/service/rpc/nezha.go b/service/rpc/nezha.go
index 8dcc02f..9ec4f2f 100644
--- a/service/rpc/nezha.go
+++ b/service/rpc/nezha.go
@@ -21,12 +21,7 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece
if clientID, err = s.Auth.Check(c); err != nil {
return nil, err
}
- if r.GetType() != model.TaskTypeCommand {
- dao.ServiceSentinelShared.Dispatch(dao.ReportData{
- Data: r,
- Reporter: clientID,
- })
- } else {
+ if r.GetType() == model.TaskTypeCommand {
// 处理上报的计划任务
dao.CronLock.RLock()
defer dao.CronLock.RUnlock()
@@ -35,16 +30,21 @@ func (s *NezhaHandler) ReportTask(c context.Context, r *pb.TaskResult) (*pb.Rece
dao.ServerLock.RLock()
defer dao.ServerLock.RUnlock()
if cr.PushSuccessful && r.GetSuccessful() {
- dao.SendNotification(fmt.Sprintf("成功计划任务:%s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
+ dao.SendNotification(fmt.Sprintf("[任务成功] %s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
}
if !r.GetSuccessful() {
- dao.SendNotification(fmt.Sprintf("失败计划任务:%s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
+ dao.SendNotification(fmt.Sprintf("[任务失败] %s ,服务器:%s,日志:\n%s", cr.Name, dao.ServerList[clientID].Name, r.GetData()), false)
}
dao.DB.Model(cr).Updates(model.Cron{
LastExecutedAt: time.Now().Add(time.Second * -1 * time.Duration(r.GetDelay())),
LastResult: r.GetSuccessful(),
})
}
+ } else if model.IsServiceSentinelNeeded(r.GetType()) {
+ dao.ServiceSentinelShared.Dispatch(dao.ReportData{
+ Data: r,
+ Reporter: clientID,
+ })
}
return &pb.Receipt{Proced: true}, nil
}
@@ -101,7 +101,7 @@ func (s *NezhaHandler) ReportSystemInfo(c context.Context, r *pb.Host) (*pb.Rece
host.IP != "" &&
dao.ServerList[clientID].Host.IP != host.IP {
dao.SendNotification(fmt.Sprintf(
- "IP变更提醒 服务器:%s ,旧IP:%s,新IP:%s。",
+ "[IP变更] %s ,旧IP:%s,新IP:%s。",
dao.ServerList[clientID].Name, utils.IPDesensitize(dao.ServerList[clientID].Host.IP), utils.IPDesensitize(host.IP)), true)
}