🚸 对 Agent 的信息上报增加超时,(可能)增加在 Windows 上的稳定性

This commit is contained in:
naiba 2021-06-12 11:37:47 +08:00
parent 6999accb3a
commit 373b95a144
2 changed files with 20 additions and 12 deletions

View File

@ -8,8 +8,7 @@
\>> 交流论坛:[打杂社区](https://daza.net/c/nezha) (Lemmy) \>> 交流论坛:[打杂社区](https://daza.net/c/nezha) (Lemmy)
\>> QQ 交流群:872069346 **加群要求:已搭建好哪吒监控 & 有 2+ 服务器**<br> \>> QQ 交流群:872069346 **加群要求:已搭建好哪吒监控 & 有 2+ 服务器**
群友互助/服务器交流,作者不答疑,找 naiba 请至论坛发帖
\>> [我们的用户](https://www.google.com/search?q="powered+by+哪吒监控%7C哪吒面板"&filter=0) (Google) \>> [我们的用户](https://www.google.com/search?q="powered+by+哪吒监控%7C哪吒面板"&filter=0) (Google)
@ -103,7 +102,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
- net_in_speed(入站网速)、net_out_speed(出站网速)、net_all_speed(双向网速)、transfer_in(入站流量)、transfer_out(出站流量)、transfer_all(双向流量):Min/Max 数值为字节(1kb=1024,1mb = 1024\*1024) - net_in_speed(入站网速)、net_out_speed(出站网速)、net_all_speed(双向网速)、transfer_in(入站流量)、transfer_out(出站流量)、transfer_all(双向流量):Min/Max 数值为字节(1kb=1024,1mb = 1024\*1024)
- offline:不支持 Min/Max 参数 - offline:不支持 Min/Max 参数
- Duration:持续秒数,监控比较简陋,取持续时间内的 70% 采样结果 - Duration:持续秒数,监控比较简陋,取持续时间内的 70% 采样结果
- Ignore: `{"1": true, "2":false}` 忽略此规则的服务器 ID 列表 - Ignore: `{"1": true, "2":false}` 忽略此规则的服务器 ID 列表,比如忽略服务器 ID 5 的离线通知 `[{"Type":"offline","Duration":10, "Ignore":{"5": true}}]`
</details> </details>
<details> <details>
@ -198,7 +197,7 @@ URL 里面也可放置占位符,请求时会进行简单的字符串替换。
<details> <details>
<summary>如何使 OpenWrt/LEDE 自启动?来自 @艾斯德斯</summary> <summary>如何使 OpenWrt/LEDE 自启动?来自 @艾斯德斯</summary>
首先在 release 下载对应的二进制解压tar.gz包后放置到 `/root`,然后 `chmod +x /root/nezha-agent` 赋予执行权限,然后创建 `/etc/init.d/nezha-service` 首先在 release 下载对应的二进制解压 tar.gz 包后放置到 `/root`,然后 `chmod +x /root/nezha-agent` 赋予执行权限,然后创建 `/etc/init.d/nezha-service`
``` ```
#!/bin/sh /etc/rc.common #!/bin/sh /etc/rc.common
@ -251,6 +250,7 @@ restart() {
```nginx ```nginx
server{ server{
#原有的一些配置
#server_name blablabla... #server_name blablabla...
location /ws { location /ws {

View File

@ -41,7 +41,6 @@ var (
var ( var (
client pb.NezhaServiceClient client pb.NezhaServiceClient
ctx = context.Background()
updateCh = make(chan struct{}) // Agent 自动更新间隔 updateCh = make(chan struct{}) // Agent 自动更新间隔
httpClient = &http.Client{ httpClient = &http.Client{
Transport: &http.Transport{ Transport: &http.Transport{
@ -56,6 +55,7 @@ var (
const ( const (
delayWhenError = time.Second * 10 // Agent 重连间隔 delayWhenError = time.Second * 10 // Agent 重连间隔
networkTimeOut = time.Second * 5 // 普通网络超时
) )
func main() { func main() {
@ -113,7 +113,7 @@ func run() {
} }
for { for {
timeOutCtx, cancel := context.WithTimeout(ctx, time.Second*5) timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
conn, err = grpc.DialContext(timeOutCtx, server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth)) conn, err = grpc.DialContext(timeOutCtx, server, grpc.WithInsecure(), grpc.WithPerRPCCredentials(&auth))
if err != nil { if err != nil {
println("grpc.Dial err: ", err) println("grpc.Dial err: ", err)
@ -124,19 +124,25 @@ func run() {
cancel() cancel()
client = pb.NewNezhaServiceClient(conn) client = pb.NewNezhaServiceClient(conn)
// 第一步注册 // 第一步注册
_, err = client.ReportSystemInfo(ctx, monitor.GetHost().PB()) timeOutCtx, cancel = context.WithTimeout(context.Background(), networkTimeOut)
_, err = client.ReportSystemInfo(timeOutCtx, monitor.GetHost().PB())
if err != nil { if err != nil {
println("client.ReportSystemInfo err: ", err) println("client.ReportSystemInfo err: ", err)
cancel()
retry() retry()
continue continue
} }
cancel()
// 执行 Task // 执行 Task
tasks, err := client.RequestTask(ctx, monitor.GetHost().PB()) timeOutCtx, cancel = context.WithTimeout(context.Background(), networkTimeOut)
tasks, err := client.RequestTask(timeOutCtx, monitor.GetHost().PB())
if err != nil { if err != nil {
println("client.RequestTask err: ", err) println("client.RequestTask err: ", err)
cancel()
retry() retry()
continue continue
} }
cancel()
err = receiveTasks(tasks) err = receiveTasks(tasks)
println("receiveTasks exit to main: ", err) println("receiveTasks exit to main: ", err)
retry() retry()
@ -226,7 +232,7 @@ func doTask(task *pb.Task) {
if err != nil { if err != nil {
// 进程组创建失败,直接退出 // 进程组创建失败,直接退出
result.Data = err.Error() result.Data = err.Error()
client.ReportTask(ctx, &result) client.ReportTask(context.Background(), &result)
return return
} }
timeout := time.NewTimer(time.Hour * 2) timeout := time.NewTimer(time.Hour * 2)
@ -258,7 +264,7 @@ func doTask(task *pb.Task) {
default: default:
println("Unknown action: ", task) println("Unknown action: ", task)
} }
client.ReportTask(ctx, &result) client.ReportTask(context.Background(), &result)
} }
func reportState() { func reportState() {
@ -268,14 +274,16 @@ func reportState() {
for { for {
if client != nil { if client != nil {
monitor.TrackNetworkSpeed() monitor.TrackNetworkSpeed()
_, err = client.ReportSystemState(ctx, monitor.GetState(dao.ReportDelay).PB()) timeOutCtx, cancel := context.WithTimeout(context.Background(), networkTimeOut)
_, err = client.ReportSystemState(timeOutCtx, monitor.GetState(dao.ReportDelay).PB())
cancel()
if err != nil { if err != nil {
println("reportState error", err) println("reportState error", err)
time.Sleep(delayWhenError) time.Sleep(delayWhenError)
} }
if lastReportHostInfo.Before(time.Now().Add(-10 * time.Minute)) { if lastReportHostInfo.Before(time.Now().Add(-10 * time.Minute)) {
lastReportHostInfo = time.Now() lastReportHostInfo = time.Now()
client.ReportSystemInfo(ctx, monitor.GetHost().PB()) client.ReportSystemInfo(context.Background(), monitor.GetHost().PB())
} }
} }
} }