Autonomous systems: watchdog, smart deploy, site health, auto-heal, agent installer

- deploy/jarvis-watchdog.sh: self-healing watchdog (every 5 min)
  * monitors lsws/mysql/redis, restarts on failure
  * JARVIS HTTP self-check, restarts OLS on 5xx
  * disk/memory alerts inserted to DB
  * offline Proxmox VM agents restarted via qm guest exec
  * log rotation (1000 line cap)
- deploy/jarvis-deploy.sh: smart deploy with PHP validation
  * php8.3 syntax check on every changed .php file
  * auto-reverts git commit + inserts critical alert on syntax error
  * reloads OLS after JARVIS deploys
- api/endpoints/facts_collector.php: site health monitoring
  * curls all 7 managed sites every 3 min
  * stores up/down status in kb_facts
- api/endpoints/alerts.php: auto-heal + site alerts
  * dispatches restart_service commands when services down on agents
  * generates alerts from kb_facts site health data
- public_html/install-agent.sh: one-liner Linux agent installer
  * installs deps, downloads agent, registers with JARVIS, sets up systemd
- public_html/webhook.php: fixed infra deploy path to /opt/infra
This commit is contained in:
2026-05-25 14:08:07 +00:00
parent 3e34b6d796
commit 45fef11785
6 changed files with 352 additions and 3 deletions
+43 -2
View File
@@ -87,15 +87,56 @@ function refresh_agent_alerts(): void {
} }
} }
// Services down // Services down — alert AND dispatch auto-restart command
foreach (($d['services'] ?? []) as $svc) { foreach (($d['services'] ?? []) as $svc) {
if (($svc['status'] ?? '') === 'active') continue; if (($svc['status'] ?? '') === 'active') continue;
if (($svc['status'] ?? '') === 'unknown') continue; // not watched/installed if (($svc['status'] ?? '') === 'unknown') continue;
$svcName = $svc['service'] ?? ''; $svcName = $svc['service'] ?? '';
$key = 'agent:' . $id . ':svc:' . $svcName; $key = 'agent:' . $id . ':svc:' . $svcName;
upsert_alert($key, 'warning', 'Service Down: ' . $svcName . ' on ' . $hn, upsert_alert($key, 'warning', 'Service Down: ' . $svcName . ' on ' . $hn,
$svcName . ' is ' . ($svc['status'] ?? 'inactive') . ' on ' . $hn . '.'); $svcName . ' is ' . ($svc['status'] ?? 'inactive') . ' on ' . $hn . '.');
$still_active[$key] = true; $still_active[$key] = true;
// Auto-dispatch restart if no pending command already queued
$pending = JarvisDB::query(
"SELECT id FROM agent_commands WHERE agent_id=? AND command_type='restart_service'
AND status IN ('pending','delivered') AND created_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)
AND JSON_EXTRACT(command_data,'$.service')=?",
[$id, $svcName]
);
if (empty($pending)) {
JarvisDB::query(
"INSERT INTO agent_commands (agent_id, command_type, command_data, status)
VALUES (?,?,?,?)",
[$id, 'restart_service', json_encode(['service' => $svcName]), 'pending']
);
}
}
}
// ── Site health alerts from kb_facts ──────────────────────────────────────
$siteKeys = ['jarvis','tomsjavajive','epictravelexp','parkersling','orbishosting','orbisportal','tomtomgames'];
$siteNames = [
'jarvis' => 'jarvis.orbishosting.com',
'tomsjavajive' => 'tomsjavajive.com',
'epictravelexp'=> 'epictravelexpeditions.com',
'parkersling' => 'parkerslingshot.epictravelexpeditions.com',
'orbishosting' => 'orbishosting.com',
'orbisportal' => 'orbis.orbishosting.com',
'tomtomgames' => 'tomtomgames.com',
];
$siteFacts = JarvisDB::query(
"SELECT fact_key, fact_value FROM kb_facts WHERE category='sites'
AND updated_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)"
);
foreach ($siteFacts as $sf) {
$skey = $sf['fact_key'];
$status = $sf['fact_value'];
$domain = $siteNames[$skey] ?? $skey;
if ($status !== 'up') {
$alertKey = 'site:' . $skey . ':down';
upsert_alert($alertKey, 'critical', 'Site Down: ' . $domain,
$domain . ' returned status ' . $status . '. Site may be unreachable.');
$still_active[$alertKey] = true;
} }
} }
+33
View File
@@ -278,6 +278,39 @@ function collect_all(): array {
$results['ollama'] = 'error: ' . $e->getMessage(); $results['ollama'] = 'error: ' . $e->getMessage();
} }
// ── Site Health ───────────────────────────────────────────────────────
try {
$sites = [
'jarvis' => 'https://jarvis.orbishosting.com',
'tomsjavajive' => 'https://tomsjavajive.com',
'epictravelexp'=> 'https://epictravelexpeditions.com',
'parkersling' => 'https://parkerslingshot.epictravelexpeditions.com',
'orbishosting' => 'https://orbishosting.com',
'orbisportal' => 'https://orbis.orbishosting.com',
'tomtomgames' => 'https://tomtomgames.com',
];
$down = [];
foreach ($sites as $key => $url) {
$ch = curl_init($url);
curl_setopt_array($ch, [
CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_CONNECTTIMEOUT => 5,
CURLOPT_NOBODY => true,
]);
curl_exec($ch);
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
$status = ($code >= 200 && $code < 400) ? 'up' : "down-$code";
KBEngine::storeFact('sites', $key, $status, $url, 180);
if ($status !== 'up') $down[] = "$key($code)";
}
$results['sites'] = empty($down) ? 'all up' : 'DOWN: ' . implode(', ', $down);
} catch (Exception $e) {
$results['sites'] = 'error: ' . $e->getMessage();
}
return $results; return $results;
} }
+74
View File
@@ -0,0 +1,74 @@
#!/bin/bash
# JARVIS Auto-Deploy Runner — processes GitHub webhook queue every minute.
# Validates PHP syntax before deploying; auto-reverts on bad code.
# Restarts OLS after JARVIS deploys to pick up PHP changes.
#
# Queue format: one repository path per line. For each path the script
# fetches/pulls origin/main, lints every changed .php file with $PHP -l and,
# on the first lint failure, hard-resets to the previous commit and inserts a
# critical alert row into the JARVIS database.

QUEUE=/tmp/jarvis-deploy-queue.txt
LOG=/home/jarvis.orbishosting.com/logs/deploy.log
PHP=/usr/bin/php8.3

# Timestamp helper used by log().
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Append one timestamped line to the deploy log.
log() { echo "[$(TS)] $1" >> "$LOG"; }

# Escape a value for use inside a single-quoted SQL string literal
# (doubles backslashes and single quotes).
sql_escape() {
    local s=$1 q="'" bs='\'
    s=${s//"$bs"/"$bs$bs"}
    s=${s//"$q"/"$q$q"}
    printf '%s' "$s"
}

[ ! -f "$QUEUE" ] && exit 0
[ ! -s "$QUEUE" ] && exit 0

# Snapshot and clear the queue with a rename instead of cat + truncate:
# rename is atomic, so an entry appended between "read" and "clear" can no
# longer be silently dropped. The next writer simply recreates $QUEUE.
WORK=$(mktemp "${QUEUE}.XXXXXX") || exit 1
trap 'rm -f "$WORK"' EXIT
if ! mv -f "$QUEUE" "$WORK"; then
    log "ERROR: could not claim queue file $QUEUE"
    exit 1
fi
SNAPSHOT=$(cat "$WORK")

while IFS= read -r path; do
    [ -z "$path" ] && continue
    if [ ! -d "$path/.git" ]; then
        log "SKIP $path — not a git repo"
        continue
    fi

    log "Deploying $path"
    cd "$path" || continue

    BEFORE=$(git rev-parse HEAD 2>/dev/null)
    if ! git fetch origin main >> "$LOG" 2>&1; then
        # Without a successful fetch origin/main may be stale; do not deploy.
        log "ERROR: git fetch failed for $path — skipping"
        continue
    fi
    REMOTE=$(git rev-parse origin/main 2>/dev/null)
    if [ -z "$BEFORE" ] || [ -z "$REMOTE" ]; then
        log "ERROR: could not resolve HEAD or origin/main for $path — skipping"
        continue
    fi

    if [ "$BEFORE" = "$REMOTE" ]; then
        log "Already up to date: $path"
        continue
    fi

    if ! git pull origin main >> "$LOG" 2>&1; then
        # Previously a failed pull fell through and was logged as "Deploy OK".
        log "ERROR: git pull failed for $path — skipping"
        continue
    fi
    AFTER=$(git rev-parse HEAD 2>/dev/null)
    CHANGED=$(git diff --name-only "$BEFORE" "$AFTER" 2>/dev/null)

    # PHP syntax validation — check every changed .php file
    SYNTAX_OK=true
    BAD_FILE=""
    while IFS= read -r f; do
        [[ "$f" != *.php ]] && continue
        [ ! -f "$f" ] && continue          # deleted in this change set
        if ! $PHP -l "$f" > /dev/null 2>&1; then
            SYNTAX_OK=false
            BAD_FILE="$f"
            break
        fi
    done <<< "$CHANGED"

    if [ "$SYNTAX_OK" = false ]; then
        log "SYNTAX ERROR in $BAD_FILE — reverting to $BEFORE"
        git reset --hard "$BEFORE" >> "$LOG" 2>&1
        # Insert alert into JARVIS DB. The file name comes from the repository,
        # so escape it before embedding it in the SQL literal.
        # NOTE(review): the DB password is passed on the command line and is
        # committed to the repo — consider a ~/.my.cnf or --defaults-extra-file.
        SAFE_FILE=$(sql_escape "$BAD_FILE")
        mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se \
            "INSERT INTO alerts (alert_type,title,message,severity)
             VALUES ('deploy_fail','Deploy reverted: syntax error',
             'PHP syntax error in $SAFE_FILE. Commit $AFTER was reverted automatically.','critical');" 2>/dev/null
        log "Reverted. Bad commit: $AFTER"
        continue
    fi

    log "Deploy OK ($BEFORE -> $AFTER): $path"
    log "Changed: $(echo "$CHANGED" | tr '\n' ' ')"

    # Restart OLS after any JARVIS deploy to pick up PHP changes
    if [[ "$path" == *"jarvis"* ]]; then
        if systemctl reload lsws 2>/dev/null || systemctl restart lsws 2>/dev/null; then
            log "OLS reloaded for JARVIS deploy"
        else
            log "ERROR: OLS reload/restart failed after JARVIS deploy"
        fi
    fi
done <<< "$SNAPSHOT"
+110
View File
@@ -0,0 +1,110 @@
#!/bin/bash
# JARVIS Self-Healing Watchdog — runs every 5 min via root cron
# Checks: lsws, mysql, redis, JARVIS HTTP, disk, memory
# Auto-heals: restarts failed services, restarts offline Proxmox VM agents
# Logs to: /home/jarvis.orbishosting.com/logs/watchdog.log

LOG=/home/jarvis.orbishosting.com/logs/watchdog.log
# NOTE(review): the DB password is embedded here and visible in the process
# list while mysql runs — consider a --defaults-extra-file instead.
MYSQL="mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se"

# Timestamp helper used by log().
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Append one timestamped line to the watchdog log.
log() { echo "[$(TS)] $1" >> "$LOG"; }

# Escape a value for use inside a single-quoted SQL string literal
# (doubles backslashes and single quotes).
sql_escape() {
    local s=$1 q="'" bs='\'
    s=${s//"$bs"/"$bs$bs"}
    s=${s//"$q"/"$q$q"}
    printf '%s' "$s"
}

# alert TYPE TITLE MESSAGE [SEVERITY] [KEY]
#   Inserts an auto-resolving alert row. SEVERITY defaults to "warning".
#   KEY (defaults to TYPE) is stored as source_key "watchdog:KEY" and is the
#   value resolve() must be called with to close the alert again.
#   Uses INSERT IGNORE — presumably deduplicated by a unique index on the
#   alerts table; verify against the schema.
alert() {
    local type="$1" sev="${4:-warning}" key="${5:-$1}" title msg
    title=$(sql_escape "$2")
    msg=$(sql_escape "$3")
    $MYSQL "INSERT IGNORE INTO alerts (alert_type,title,message,severity,source_key,auto_resolve)
            VALUES ('$type','$title','$msg','$sev','watchdog:$key',1);" 2>/dev/null
}

# resolve KEY
#   Marks every open alert with source_key "watchdog:KEY" as resolved.
resolve() {
    $MYSQL "UPDATE alerts SET resolved=1,resolved_at=NOW()
            WHERE source_key='watchdog:$1' AND resolved=0;" 2>/dev/null
}

# ── Service health ─────────────────────────────────────────────────────────────
for SVC in lsws mysql redis; do
    if ! systemctl is-active --quiet "$SVC"; then
        log "HEAL: $SVC is down — restarting"
        systemctl restart "$SVC"
        # The alert key is per service so that it matches the resolve() call
        # below; previously alerts were written as watchdog:service_down but
        # resolved as watchdog:service_down_<svc>, so they never closed.
        if systemctl is-active --quiet "$SVC"; then
            log "HEAL: $SVC restarted successfully"
            alert "service_down" "$SVC restarted" "JARVIS watchdog restarted $SVC which was stopped." "warning" "service_down_$SVC"
        else
            log "ERROR: $SVC failed to restart"
            alert "service_down" "$SVC failed to restart" "$SVC is down and could not be restarted automatically." "critical" "service_down_$SVC"
        fi
    else
        resolve "service_down_$SVC"
    fi
done

# ── JARVIS HTTP self-check ─────────────────────────────────────────────────────
HTTP_CODE=$(curl -sk -o /dev/null -w "%{http_code}" --max-time 10 https://jarvis.orbishosting.com/api.php 2>/dev/null)
# curl prints "000" (not an empty string) when no HTTP response was received
# (connection refused, timeout, TLS failure), so treat that as a failure too.
if [[ "$HTTP_CODE" == "5"* ]] || [[ -z "$HTTP_CODE" ]] || [[ "$HTTP_CODE" == "000" ]]; then
    log "HEAL: JARVIS HTTP returned $HTTP_CODE — restarting lsws"
    systemctl restart lsws
    alert "jarvis_http" "JARVIS HTTP error — restarted OLS" "JARVIS returned HTTP $HTTP_CODE. OpenLiteSpeed was restarted." "critical"
else
    resolve "jarvis_http"
fi

# ── Disk usage ─────────────────────────────────────────────────────────────────
DISK_PCT=$(df / | awk 'NR==2{print $5}' | tr -d '%')
if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
    # Avoid "integer expression expected" errors if df output is unexpected.
    log "WARN: could not determine disk usage (got '$DISK_PCT')"
elif [ "$DISK_PCT" -ge 90 ]; then
    log "ALERT: Disk at ${DISK_PCT}% (critical)"
    alert "disk_critical" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full. Immediate cleanup required." "critical"
elif [ "$DISK_PCT" -ge 80 ]; then
    log "WARN: Disk at ${DISK_PCT}%"
    alert "disk_warning" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full." "warning"
    # Usage dropped back below the critical threshold.
    resolve "disk_critical"
else
    resolve "disk_critical"
    resolve "disk_warning"
fi

# ── Memory usage ──────────────────────────────────────────────────────────────
MEM_TOTAL=$(awk '/^MemTotal:/{print $2}' /proc/meminfo)
MEM_AVAIL=$(awk '/^MemAvailable:/{print $2}' /proc/meminfo)
if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_AVAIL" =~ ^[0-9]+$ && "$MEM_TOTAL" -gt 0 ]]; then
    MEM_PCT=$(( (MEM_TOTAL - MEM_AVAIL) * 100 / MEM_TOTAL ))
    if [ "$MEM_PCT" -ge 90 ]; then
        log "ALERT: Memory at ${MEM_PCT}%"
        alert "mem_critical" "Memory ${MEM_PCT}% used on DO server" "DO server memory is ${MEM_PCT}% used." "critical"
    else
        # Close the alert once memory pressure is gone, like the other checks.
        resolve "mem_critical"
    fi
else
    log "WARN: could not read MemTotal/MemAvailable from /proc/meminfo"
fi

# ── Offline agent auto-restart (Proxmox VMs only) ─────────────────────────────
# Map: agent_id → "proxmox_ip vmid"
declare -A AGENT_PVE=(
    ["ollama_vm"]="10.48.200.90 210"
    ["ha_vm"]="10.48.200.90 101"
    ["networkbackup_vm"]="10.48.200.91 302"
)

OFFLINE=$($MYSQL "SELECT agent_id FROM registered_agents
                  WHERE status='offline' AND last_seen < DATE_SUB(NOW(), INTERVAL 5 MINUTE)
                  AND agent_type='linux';" 2>/dev/null)

for AID in $OFFLINE; do
    # Check if we have a Proxmox mapping for this agent. Matching is a loose
    # substring test in either direction, so the first map key that contains
    # (or is contained in) the agent id wins.
    for KEY in "${!AGENT_PVE[@]}"; do
        if [[ "$AID" == *"$KEY"* ]] || [[ "$KEY" == *"$AID"* ]]; then
            read -r PVE_IP VMID <<< "${AGENT_PVE[$KEY]}"
            log "HEAL: Attempting to restart jarvis-agent on $AID (VM $VMID @ $PVE_IP)"
            # NOTE(review): root password is hard-coded and host key checking is
            # disabled — key-based SSH with a known_hosts entry would be safer.
            sshpass -p 'Joker1974!!!' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
                root@"$PVE_IP" \
                "qm guest exec $VMID -- systemctl restart jarvis-agent" 2>/dev/null
            RC=$?
            log "HEAL: Restart command sent to $AID (exit: $RC)"
            alert "agent_offline" "Auto-restarted agent: $AID" \
                "Agent $AID was offline. JARVIS watchdog sent restart command via Proxmox." "warning"
            break
        fi
    done
done

# ── Log rotation (once a log exceeds 1000 lines, keep only the last 500) ──────
for LOGFILE in "$LOG" /home/jarvis.orbishosting.com/logs/deploy.log /home/jarvis.orbishosting.com/logs/cron.log; do
    [ -f "$LOGFILE" ] || continue
    LINES=$(wc -l < "$LOGFILE")
    if [ "$LINES" -gt 1000 ]; then
        tail -n 500 "$LOGFILE" > "${LOGFILE}.tmp" && mv "${LOGFILE}.tmp" "$LOGFILE"
    fi
done
+91
View File
@@ -0,0 +1,91 @@
#!/bin/bash
# JARVIS Agent Installer — one-liner for any Linux host:
#   curl -sk https://jarvis.orbishosting.com/install-agent.sh | bash -s <hostname> <agent_type>
#
# agent_type: linux | proxmox | homeassistant
# Example: curl -sk https://jarvis.orbishosting.com/install-agent.sh | bash -s myserver linux
#
# Steps: install python3/pip/curl, download the agent, register it with JARVIS
# to obtain an agent_id + api_key, write config.json, install and start a
# systemd unit. Must run as root (writes to /opt and /etc/systemd/system).
set -e

HOSTNAME="${1:-$(hostname -s)}"
AGENT_TYPE="${2:-linux}"
# The server is addressed by IP with an explicit Host header, and TLS
# verification is disabled (-k) on every request below.
JARVIS_URL="https://165.22.1.228"
JARVIS_HOST="jarvis.orbishosting.com"
INSTALL_DIR="/opt/jarvis-agent"
SERVICE_FILE="/etc/systemd/system/jarvis-agent.service"
CONFIG_FILE="$INSTALL_DIR/config.json"

# Print an error to stderr and abort.
die() { echo "ERROR: $1" >&2; exit 1; }

echo "=== JARVIS Agent Installer ==="
echo "Host: $HOSTNAME | Type: $AGENT_TYPE | Server: $JARVIS_URL"

[ "$(id -u)" -eq 0 ] || die "this installer must be run as root"

# ── Dependencies ──────────────────────────────────────────────────────────────
# Failures still abort the install (as before under set -e), but now say why
# instead of exiting silently with stderr discarded.
if command -v apt-get &>/dev/null; then
    apt-get install -yq python3 python3-pip curl 2>/dev/null \
        || die "apt-get could not install python3/python3-pip/curl"
elif command -v yum &>/dev/null; then
    yum install -yq python3 python3-pip curl 2>/dev/null \
        || die "yum could not install python3/python3-pip/curl"
fi
pip3 install -q requests psutil 2>/dev/null \
    || pip install -q requests psutil 2>/dev/null \
    || die "pip could not install the 'requests' and 'psutil' packages"

# ── Download agent ─────────────────────────────────────────────────────────────
mkdir -p "$INSTALL_DIR"
# -f makes curl fail on HTTP errors so an error page is never saved as the agent.
curl -fsk -H "Host: $JARVIS_HOST" "$JARVIS_URL/agent/jarvis-agent.py" -o "$INSTALL_DIR/jarvis-agent.py" \
    || die "could not download jarvis-agent.py from $JARVIS_URL"
[ -s "$INSTALL_DIR/jarvis-agent.py" ] || die "downloaded jarvis-agent.py is empty"
chmod +x "$INSTALL_DIR/jarvis-agent.py"

# ── Register with JARVIS to get API key ───────────────────────────────────────
IP=$(hostname -I | awk '{print $1}')
PAYLOAD=$(printf '{"hostname":"%s","agent_type":"%s","ip_address":"%s","capabilities":["metrics","commands"]}' \
    "$HOSTNAME" "$AGENT_TYPE" "$IP")

# "|| true" on the three assignments below: under set -e a failing command
# substitution would otherwise terminate the script before the explicit
# "Registration failed" diagnostic can be printed.
REG=$(curl -sk -H "Host: $JARVIS_HOST" \
    -H "Content-Type: application/json" \
    -H "X-Registration-Key: f846a9aaf7ce9a61742c63c87c4186052a71d2a580c65518" \
    -X POST "$JARVIS_URL/api/agent/register" \
    -d "$PAYLOAD") || true

AGENT_ID=$(echo "$REG" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['agent_id'])" 2>/dev/null) || true
API_KEY=$(echo "$REG" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['api_key'])" 2>/dev/null) || true

if [ -z "$API_KEY" ]; then
    echo "ERROR: Registration failed — $REG"
    exit 1
fi
echo "Registered: agent_id=$AGENT_ID"

# ── Write config ───────────────────────────────────────────────────────────────
# The file holds the agent's API key: create it owner-only before writing.
: > "$CONFIG_FILE"
chmod 600 "$CONFIG_FILE"
cat > "$CONFIG_FILE" << EOF
{
  "server_url": "$JARVIS_URL",
  "host_header": "$JARVIS_HOST",
  "agent_id": "$AGENT_ID",
  "api_key": "$API_KEY",
  "agent_type": "$AGENT_TYPE",
  "heartbeat_interval": 10,
  "metrics_interval": 30
}
EOF

# ── Systemd service ────────────────────────────────────────────────────────────
cat > "$SERVICE_FILE" << EOF
[Unit]
Description=JARVIS Agent
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/usr/bin/python3 $INSTALL_DIR/jarvis-agent.py
WorkingDirectory=$INSTALL_DIR
Restart=always
RestartSec=10
StartLimitInterval=60
StartLimitBurst=5
[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable jarvis-agent
systemctl restart jarvis-agent

echo "=== JARVIS Agent installed and running ==="
echo "Config: $CONFIG_FILE"
echo "Logs: journalctl -u jarvis-agent -f"
# Final status line; a non-active unit makes the installer exit non-zero.
systemctl is-active jarvis-agent
+1 -1
View File
@@ -40,7 +40,7 @@ $repoMap = [
'orbishosting' => '/home/orbishosting.com/public_html', 'orbishosting' => '/home/orbishosting.com/public_html',
'orbis-hosting-portal' => '/home/orbis.orbishosting.com/public_html', 'orbis-hosting-portal' => '/home/orbis.orbishosting.com/public_html',
'tomtomgames' => '/home/tomtomgames.com/public_html', 'tomtomgames' => '/home/tomtomgames.com/public_html',
'infra' => '/tmp/infra-current', 'infra' => '/opt/infra',
]; ];
if (!isset($repoMap[$repo])) { if (!isset($repoMap[$repo])) {