Restart running monitors if no heartbeat (#3952)
This commit is contained in:
parent
9f170a68d7
commit
c43223a16d
|
@ -3,7 +3,7 @@ const dayjs = require("dayjs");
|
|||
const axios = require("axios");
|
||||
const { Prometheus } = require("../prometheus");
|
||||
const { log, UP, DOWN, PENDING, MAINTENANCE, flipStatus, TimeLogger, MAX_INTERVAL_SECOND, MIN_INTERVAL_SECOND,
|
||||
SQL_DATETIME_FORMAT
|
||||
SQL_DATETIME_FORMAT, isDev, sleep, getRandomInt
|
||||
} = require("../../src/util");
|
||||
const { tcping, ping, dnsResolve, checkCertificate, checkStatusCode, getTotalClientInRoom, setting, mssqlQuery, postgresQuery, mysqlQuery, mqttAsync, setSetting, httpNtlm, radius, grpcQuery,
|
||||
redisPingAsync, mongodbPing, kafkaProducerAsync, getOidcTokenClientCredentials, rootCertificatesFingerprints
|
||||
|
@ -328,6 +328,16 @@ class Monitor extends BeanModel {
|
|||
}
|
||||
}
|
||||
|
||||
// Evil
|
||||
if (isDev) {
|
||||
if (process.env.EVIL_RANDOM_MONITOR_SLEEP === "SURE") {
|
||||
if (getRandomInt(0, 100) === 0) {
|
||||
log.debug("evil", `[${this.name}] Evil mode: Random sleep: ` + beatInterval * 10000);
|
||||
await sleep(beatInterval * 10000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Expose here for prometheus update
|
||||
// undefined if not https
|
||||
let tlsInfo = undefined;
|
||||
|
@ -995,6 +1005,7 @@ class Monitor extends BeanModel {
|
|||
if (! this.isStop) {
|
||||
log.debug("monitor", `[${this.name}] SetTimeout for next check.`);
|
||||
this.heartbeatInterval = setTimeout(safeBeat, beatInterval * 1000);
|
||||
this.lastScheduleBeatTime = dayjs();
|
||||
} else {
|
||||
log.info("monitor", `[${this.name}] isStop = true, no next check.`);
|
||||
}
|
||||
|
@ -1004,7 +1015,9 @@ class Monitor extends BeanModel {
|
|||
/** Get a heartbeat and handle errors */
|
||||
const safeBeat = async () => {
|
||||
try {
|
||||
this.lastStartBeatTime = dayjs();
|
||||
await beat();
|
||||
this.lastEndBeatTime = dayjs();
|
||||
} catch (e) {
|
||||
console.trace(e);
|
||||
UptimeKumaServer.errorLog(e, false);
|
||||
|
@ -1013,6 +1026,9 @@ class Monitor extends BeanModel {
|
|||
if (! this.isStop) {
|
||||
log.info("monitor", "Try to restart the monitor");
|
||||
this.heartbeatInterval = setTimeout(safeBeat, this.interval * 1000);
|
||||
this.lastScheduleBeatTime = dayjs();
|
||||
} else {
|
||||
log.info("monitor", "isStop = true, no next check.");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -12,6 +12,7 @@ const { Settings } = require("./settings");
|
|||
const dayjs = require("dayjs");
|
||||
const childProcess = require("child_process");
|
||||
const path = require("path");
|
||||
const axios = require("axios");
|
||||
// DO NOT IMPORT MODULES HERE IF THEY USE `UptimeKumaServer.getInstance()`; put those imports at the bottom of this file instead.
|
||||
|
||||
/**
|
||||
|
@ -62,6 +63,8 @@ class UptimeKumaServer {
|
|||
*/
|
||||
jwtSecret = null;
|
||||
|
||||
checkMonitorsInterval = null;
|
||||
|
||||
static getInstance(args) {
|
||||
if (UptimeKumaServer.instance == null) {
|
||||
UptimeKumaServer.instance = new UptimeKumaServer(args);
|
||||
|
@ -75,6 +78,9 @@ class UptimeKumaServer {
|
|||
const sslCert = args["ssl-cert"] || process.env.UPTIME_KUMA_SSL_CERT || process.env.SSL_CERT || undefined;
|
||||
const sslKeyPassphrase = args["ssl-key-passphrase"] || process.env.UPTIME_KUMA_SSL_KEY_PASSPHRASE || process.env.SSL_KEY_PASSPHRASE || undefined;
|
||||
|
||||
// Set default axios timeout to 5 minutes instead of infinity
|
||||
axios.defaults.timeout = 300 * 1000;
|
||||
|
||||
log.info("server", "Creating express and socket.io instance");
|
||||
this.app = express();
|
||||
if (sslKey && sslCert) {
|
||||
|
@ -346,6 +352,10 @@ class UptimeKumaServer {
|
|||
if (enable || enable === null) {
|
||||
this.startNSCDServices();
|
||||
}
|
||||
|
||||
this.checkMonitorsInterval = setInterval(() => {
|
||||
this.checkMonitors();
|
||||
}, 60 * 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -358,6 +368,8 @@ class UptimeKumaServer {
|
|||
if (enable || enable === null) {
|
||||
this.stopNSCDServices();
|
||||
}
|
||||
|
||||
clearInterval(this.checkMonitorsInterval);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -388,6 +400,83 @@ class UptimeKumaServer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the specified monitor
|
||||
* @param {number} monitorID ID of monitor to start
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async startMonitor(monitorID) {
|
||||
log.info("manage", `Resume Monitor: ${monitorID} by server`);
|
||||
|
||||
await R.exec("UPDATE monitor SET active = 1 WHERE id = ?", [
|
||||
monitorID,
|
||||
]);
|
||||
|
||||
let monitor = await R.findOne("monitor", " id = ? ", [
|
||||
monitorID,
|
||||
]);
|
||||
|
||||
if (monitor.id in this.monitorList) {
|
||||
this.monitorList[monitor.id].stop();
|
||||
}
|
||||
|
||||
this.monitorList[monitor.id] = monitor;
|
||||
monitor.start(this.io);
|
||||
}
|
||||
|
||||
/**
|
||||
* Restart a given monitor
|
||||
* @param {number} monitorID ID of monitor to start
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async restartMonitor(monitorID) {
|
||||
return await this.startMonitor(monitorID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if monitors are running properly
|
||||
*/
|
||||
async checkMonitors() {
|
||||
log.debug("monitor_checker", "Checking monitors");
|
||||
|
||||
for (let monitorID in this.monitorList) {
|
||||
let monitor = this.monitorList[monitorID];
|
||||
|
||||
// Not for push monitor
|
||||
if (monitor.type === "push") {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!monitor.active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check the lastStartBeatTime, if it is too long, then restart
|
||||
if (monitor.lastScheduleBeatTime ) {
|
||||
let diff = dayjs().diff(monitor.lastStartBeatTime, "second");
|
||||
|
||||
if (diff > monitor.interval * 1.5) {
|
||||
log.error("monitor_checker", `Monitor Interval: ${monitor.interval} Monitor ` + monitorID + " lastStartBeatTime diff: " + diff);
|
||||
log.error("monitor_checker", "Unexpected error: Monitor " + monitorID + " is struck for unknown reason");
|
||||
log.error("monitor_checker", "Last start beat time: " + R.isoDateTime(monitor.lastStartBeatTime));
|
||||
log.error("monitor_checker", "Last end beat time: " + R.isoDateTime(monitor.lastEndBeatTime));
|
||||
log.error("monitor_checker", "Last ScheduleBeatTime: " + R.isoDateTime(monitor.lastScheduleBeatTime));
|
||||
|
||||
// Restart
|
||||
log.error("monitor_checker", `Restarting monitor ${monitorID} automatically now`);
|
||||
this.restartMonitor(monitorID);
|
||||
} else {
|
||||
//log.debug("monitor_checker", "Monitor " + monitorID + " is running normally");
|
||||
}
|
||||
} else {
|
||||
//log.debug("monitor_checker", "Monitor " + monitorID + " is not started yet, skipp");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
log.debug("monitor_checker", "Checking monitors end");
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
|
Loading…
Reference in New Issue