Autonomous systems: watchdog, smart deploy, site health, auto-heal, agent installer

- deploy/jarvis-watchdog.sh: self-healing watchdog (every 5 min)
  * monitors lsws/mysql/redis, restarts on failure
  * JARVIS HTTP self-check, restarts OLS on 5xx
  * disk/memory alerts inserted into the DB
  * offline Proxmox VM agents restarted via qm guest exec
  * log rotation (1000 line cap)
- deploy/jarvis-deploy.sh: smart deploy with PHP validation
  * php8.3 syntax check on every changed .php file
  * auto-reverts git commit + inserts critical alert on syntax error
  * reloads OLS after JARVIS deploys
- api/endpoints/facts_collector.php: site health monitoring
  * curls all 7 managed sites every 3 min
  * stores up/down status in kb_facts
- api/endpoints/alerts.php: auto-heal + site alerts
  * dispatches restart_service commands when services are down on agents
  * generates alerts from kb_facts site health data
- public_html/install-agent.sh: one-liner Linux agent installer
  * installs deps, downloads agent, registers with JARVIS, sets up systemd
- public_html/webhook.php: fixed infra deploy path to /opt/infra
This commit is contained in:
2026-05-25 14:08:07 +00:00
parent 3e34b6d796
commit 45fef11785
6 changed files with 352 additions and 3 deletions
+43 -2
View File
@@ -87,15 +87,56 @@ function refresh_agent_alerts(): void {
}
}
// Services down
// Services down — alert AND dispatch auto-restart command
foreach (($d['services'] ?? []) as $svc) {
if (($svc['status'] ?? '') === 'active') continue;
if (($svc['status'] ?? '') === 'unknown') continue; // not watched/installed
if (($svc['status'] ?? '') === 'unknown') continue;
$svcName = $svc['service'] ?? '';
$key = 'agent:' . $id . ':svc:' . $svcName;
upsert_alert($key, 'warning', 'Service Down: ' . $svcName . ' on ' . $hn,
$svcName . ' is ' . ($svc['status'] ?? 'inactive') . ' on ' . $hn . '.');
$still_active[$key] = true;
// Auto-dispatch restart if no pending command already queued
$pending = JarvisDB::query(
"SELECT id FROM agent_commands WHERE agent_id=? AND command_type='restart_service'
AND status IN ('pending','delivered') AND created_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)
AND JSON_EXTRACT(command_data,'$.service')=?",
[$id, $svcName]
);
if (empty($pending)) {
JarvisDB::query(
"INSERT INTO agent_commands (agent_id, command_type, command_data, status)
VALUES (?,?,?,?)",
[$id, 'restart_service', json_encode(['service' => $svcName]), 'pending']
);
}
}
}
// ── Site health alerts from kb_facts ──────────────────────────────────────
$siteKeys = ['jarvis','tomsjavajive','epictravelexp','parkersling','orbishosting','orbisportal','tomtomgames'];
$siteNames = [
'jarvis' => 'jarvis.orbishosting.com',
'tomsjavajive' => 'tomsjavajive.com',
'epictravelexp'=> 'epictravelexpeditions.com',
'parkersling' => 'parkerslingshot.epictravelexpeditions.com',
'orbishosting' => 'orbishosting.com',
'orbisportal' => 'orbis.orbishosting.com',
'tomtomgames' => 'tomtomgames.com',
];
$siteFacts = JarvisDB::query(
"SELECT fact_key, fact_value FROM kb_facts WHERE category='sites'
AND updated_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)"
);
foreach ($siteFacts as $sf) {
$skey = $sf['fact_key'];
$status = $sf['fact_value'];
$domain = $siteNames[$skey] ?? $skey;
if ($status !== 'up') {
$alertKey = 'site:' . $skey . ':down';
upsert_alert($alertKey, 'critical', 'Site Down: ' . $domain,
$domain . ' returned status ' . $status . '. Site may be unreachable.');
$still_active[$alertKey] = true;
}
}