Files

119 lines
5.8 KiB
Bash
Executable File

#!/bin/bash
# JARVIS Self-Healing Watchdog — runs every 5 min via root cron
# Checks: lsws, mysql, redis, JARVIS HTTP, disk, memory
# Auto-heals: restarts failed services, restarts offline Proxmox VM agents
# Logs to: /home/jarvis.orbishosting.com/logs/watchdog.log
# File that log() appends watchdog messages to.
LOG="/home/jarvis.orbishosting.com/logs/watchdog.log"
# MySQL client command line. Callers expand it unquoted (so it word-splits into
# the command and its options) and append a single SQL statement as the last
# argument.
# NOTE(review): the database password is embedded here and passed on the
# command line — consider a root-only option file instead.
MYSQL='mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se'
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() {
  # %F is %Y-%m-%d and %T is %H:%M:%S
  date +'%F %T'
}
# Append one line of the form "[<timestamp>] <message>" to the file named by
# $LOG. The timestamp comes from TS.
# Arguments: $1 - message text
log() {
  local stamp
  stamp=$(TS)
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG"
}
# Escape a value so it can be placed inside a single-quoted MySQL string
# literal built by shell interpolation.
# Backslashes are doubled first, then every single quote is doubled. Doubling
# the quote (rather than emitting \'') matters: MySQL reads \' as an escaped
# quote and the quote after it as the end of the literal, which breaks the
# statement for any value containing an apostrophe.
# Assumes the server treats backslash as an escape character inside strings
# (the default sql_mode).
# Arguments: $1 - raw value
# Outputs:   the escaped value on stdout
sql_esc() {
  printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e "s/'/''/g"
}
# Insert a watchdog alert row into the alerts table via the $MYSQL command.
# Arguments:
#   $1 - alert type; also used to build the source_key "watchdog:<type>"
#   $2 - title
#   $3 - message
#   $4 - severity (optional; "warning" when omitted or empty)
# Every value goes through sql_esc before it is placed in the statement.
# Anything the MySQL client prints on stderr is discarded.
alert() {
  local kind="$1" heading="$2" body="$3" level="${4:-warning}"
  local q_kind q_heading q_body q_level stmt
  q_kind=$(sql_esc "$kind")
  q_heading=$(sql_esc "$heading")
  q_body=$(sql_esc "$body")
  q_level=$(sql_esc "$level")
  # The statement is two lines long; the newline is added explicitly so the
  # text sent to the client does not depend on how this function is indented.
  stmt="INSERT IGNORE INTO alerts (alert_type,title,message,severity,source_key,auto_resolve)"
  stmt+=$'\n'"VALUES ('$q_kind','$q_heading','$q_body','$q_level','watchdog:$q_kind',1);"
  $MYSQL "$stmt" 2>/dev/null
}
# Mark every unresolved alert whose source_key is "watchdog:<key>" as resolved,
# stamping resolved_at with the database's current time.
# Arguments: $1 - key suffix (passed through sql_esc before interpolation)
# Anything the MySQL client prints on stderr is discarded.
resolve() {
  local q_key stmt
  q_key=$(sql_esc "$1")
  # Explicit newline keeps the statement text independent of indentation.
  stmt="UPDATE alerts SET resolved=1,resolved_at=NOW()"
  stmt+=$'\n'"WHERE source_key='watchdog:$q_key' AND resolved=0;"
  $MYSQL "$stmt" 2>/dev/null
}
# ── Service health ─────────────────────────────────────────────────────────────
# For each core service: if it is not active, restart it and record whether
# the restart worked; if it is active, resolve any open alert for it.
# The alert type includes the service name so that the source_key alert()
# derives from it ("watchdog:service_down_<svc>") is exactly the key the
# healthy branch resolves, and so each service has its own alert.
for SVC in lsws mysql redis; do
  if systemctl is-active --quiet "$SVC"; then
    resolve "service_down_$SVC"
    continue
  fi
  log "HEAL: $SVC is down — restarting"
  systemctl restart "$SVC"
  # Re-check rather than trusting the restart's exit status.
  if systemctl is-active --quiet "$SVC"; then
    log "HEAL: $SVC restarted successfully"
    alert "service_down_$SVC" "$SVC restarted" "JARVIS watchdog restarted $SVC which was stopped." "warning"
  else
    log "ERROR: $SVC failed to restart"
    alert "service_down_$SVC" "$SVC failed to restart" "$SVC is down and could not be restarted automatically." "critical"
  fi
done
# ── JARVIS HTTP self-check ─────────────────────────────────────────────────────
# Probe the JARVIS API endpoint and restart the web server when it is
# unhealthy. -k skips TLS certificate verification; --max-time caps the probe
# at 10 seconds.
HTTP_CODE=$(curl -sk -o /dev/null -w "%{http_code}" --max-time 10 https://jarvis.orbishosting.com/api.php 2>/dev/null)
# Unhealthy means: any 5xx status, no output at all, or "000" — the value curl
# prints for %{http_code} when it received no HTTP response (connection
# refused, timeout, TLS failure). Without the 000 case a dead server would be
# treated as healthy.
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" || "$HTTP_CODE" == 5* ]]; then
  log "HEAL: JARVIS HTTP returned $HTTP_CODE — restarting lsws"
  systemctl restart lsws
  alert "jarvis_http" "JARVIS HTTP error — restarted OLS" "JARVIS returned HTTP $HTTP_CODE. OpenLiteSpeed was restarted." "critical"
else
  resolve "jarvis_http"
fi
# ── Disk usage ─────────────────────────────────────────────────────────────────
# Check how full the root filesystem is: >= 90% raises a critical alert,
# >= 80% a warning, anything lower resolves both.
# -P forces the POSIX one-line-per-filesystem format so a long device name
# cannot wrap the row and shift the percentage out of column 5.
DISK_PCT=$(df -P / | awk 'NR==2{print $5}' | tr -d '%')
if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
  # No usable reading: report it, and leave existing alerts untouched rather
  # than resolving them on bad data.
  log "ERROR: could not determine disk usage (got '${DISK_PCT}')"
elif [ "$DISK_PCT" -ge 90 ]; then
  log "ALERT: Disk at ${DISK_PCT}% (critical)"
  alert "disk_critical" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full. Immediate cleanup required." "critical"
elif [ "$DISK_PCT" -ge 80 ]; then
  log "WARN: Disk at ${DISK_PCT}%"
  alert "disk_warning" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full." "warning"
  # Usage has dropped out of the critical band, so that alert is stale.
  resolve "disk_critical"
else
  resolve "disk_critical"
  resolve "disk_warning"
fi
# ── Memory usage ──────────────────────────────────────────────────────────────
# Print the percentage of memory in use, computed as
# (MemTotal - MemAvailable) * 100 / MemTotal and truncated to an integer.
# Arguments: $1 - meminfo-format file to read (optional, default /proc/meminfo)
# Outputs:   the integer percentage on stdout
# Returns:   non-zero when MemTotal or MemAvailable is missing or MemTotal is 0
mem_used_pct() {
  awk '
    $1 == "MemTotal:"     { total = $2 }
    $1 == "MemAvailable:" { avail = $2; have_avail = 1 }
    END {
      if (total <= 0 || !have_avail) exit 1
      printf "%d\n", int((total - avail) * 100 / total)
    }
  ' "${1:-/proc/meminfo}"
}
# Raise a critical alert at >= 90% memory use and resolve it once usage drops
# back below that. If no reading is available, log it and leave alerts alone.
if MEM_PCT=$(mem_used_pct); then
  if [ "$MEM_PCT" -ge 90 ]; then
    log "ALERT: Memory at ${MEM_PCT}%"
    alert "mem_critical" "Memory ${MEM_PCT}% used on DO server" "DO server memory is ${MEM_PCT}% used." "critical"
  else
    resolve "mem_critical"
  fi
else
  log "ERROR: could not determine memory usage from /proc/meminfo"
fi
# ── Offline agent auto-restart (Proxmox VMs only) ─────────────────────────────
# Map: agent_id → [proxmox_ip, vmid]
# Agent-id fragment → "<proxmox host> <vmid>" for the VMs this script may
# restart.
declare -A AGENT_PVE=(
  ["ollama_vm"]="orbisne.fortiddns.com 210"
  ["ha_vm"]="orbisne.fortiddns.com 101"
  ["networkbackup_vm"]="10.48.200.91 302"
)
# Linux agents flagged offline whose last_seen is more than five minutes old.
OFFLINE=$($MYSQL "SELECT agent_id FROM registered_agents
  WHERE status='offline' AND last_seen < DATE_SUB(NOW(), INTERVAL 5 MINUTE)
  AND agent_type='linux';" 2>/dev/null)
# Read one id per line into an array so ids are never word-split or glob-
# expanded by the shell.
mapfile -t OFFLINE_IDS <<< "$OFFLINE"
for AID in "${OFFLINE_IDS[@]}"; do
  # An empty id is a substring of every key and would restart the first VM
  # in the map, so blank lines (including "no rows") are skipped.
  [[ -n "$AID" ]] || continue
  # An agent matches a mapping when either name contains the other.
  for KEY in "${!AGENT_PVE[@]}"; do
    if [[ "$AID" == *"$KEY"* ]] || [[ "$KEY" == *"$AID"* ]]; then
      read -r PVE_IP VMID <<< "${AGENT_PVE[$KEY]}"
      log "HEAL: Attempting to restart jarvis-agent on $AID (VM $VMID @ $PVE_IP)"
      # The password is handed to sshpass through the SSHPASS environment
      # variable (-e) instead of -p, so it does not appear in the process
      # list.
      # NOTE(review): the root password is still embedded in this file and
      # host-key checking is disabled — key-based auth would remove both.
      SSHPASS='Joker1974!!!' sshpass -e ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
        root@"$PVE_IP" \
        "qm guest exec $VMID -- systemctl restart jarvis-agent" 2>/dev/null
      # $? here is still the exit status of the sshpass/ssh command above.
      log "HEAL: Restart command sent to $AID (exit: $?)"
      alert "agent_offline" "Auto-restarted agent: $AID" \
        "Agent $AID was offline. JARVIS watchdog sent restart command via Proxmox." "warning"
      # Only the first matching mapping is used for an agent.
      break
    fi
  done
done
# ── Log rotation (trim to the last 500 lines once a log exceeds 1000) ─────────
# Trim a log file in place when it has grown past a line limit.
# Arguments:
#   $1 - path of the log file; a missing or non-regular file is ignored
#   $2 - line count above which the file is trimmed (optional, default 1000)
#   $3 - number of trailing lines to keep (optional, default 500)
# The kept lines are staged in "<file>.tmp" and then written back over the
# original with a redirect instead of mv, so the log keeps its inode, owner
# and mode, and processes appending to it keep writing to the same file.
rotate_log() {
  local file="$1" max="${2:-1000}" keep="${3:-500}"
  local count
  [ -f "$file" ] || return 0
  count=$(wc -l < "$file")
  if [ "$count" -gt "$max" ]; then
    tail -n "$keep" "$file" > "${file}.tmp" \
      && cat "${file}.tmp" > "$file"
    rm -f "${file}.tmp"
  fi
}
for LOGFILE in "$LOG" /home/jarvis.orbishosting.com/logs/deploy.log /home/jarvis.orbishosting.com/logs/cron.log; do
  rotate_log "$LOGFILE"
done