#!/bin/bash
# JARVIS Self-Healing Watchdog — runs every 5 min via root cron
# Checks: lsws, mysql, redis, JARVIS HTTP, disk, memory
# Auto-heals: restarts failed services, restarts offline Proxmox VM agents
# Logs to: /home/jarvis.orbishosting.com/logs/watchdog.log
#
# `set -e` is deliberately not used: every check is best-effort, and a failure
# in one section must not prevent the remaining sections from running.

LOG=/home/jarvis.orbishosting.com/logs/watchdog.log

# NOTE(review): the database password is hard-coded and passed on the command
# line (visible in `ps`). Consider moving it to a root-only option file.
MYSQL="mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se"

# Print the current timestamp used as the log line prefix.
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Append one timestamped message ($1) to the watchdog log.
log() { echo "[$(TS)] $1" >> "$LOG"; }

# Run one SQL statement ($1) through the mysql client; client errors are
# discarded so an unreachable database never aborts the watchdog.
# shellcheck disable=SC2086 — $MYSQL is intentionally split into command + options
db_exec() { $MYSQL "$1" 2>/dev/null; }

# Escape a value ($1) for use inside a single-quoted MySQL string literal.
# Backslashes are doubled first, then each single quote is doubled, so the
# value can neither terminate the literal early nor start an escape sequence.
sql_esc() {
  local value=$1
  local bs='\' sq="'"
  value=${value//"$bs"/"$bs$bs"}
  value=${value//"$sq"/"$sq$sq"}
  printf '%s' "$value"
}

# Record an alert row.
# Arguments:
#   $1 - alert type (stored in alert_type)
#   $2 - title
#   $3 - message
#   $4 - severity (default: warning)
#   $5 - source key suffix (default: $1); stored as "watchdog:<suffix>" and
#        must match the argument later given to resolve()
alert() {
  local type="$1" title="$2" msg="$3" sev="${4:-warning}" key="${5:-$1}"
  local e_type e_title e_msg e_sev e_key
  e_type=$(sql_esc "$type")
  e_title=$(sql_esc "$title")
  e_msg=$(sql_esc "$msg")
  e_sev=$(sql_esc "$sev")
  e_key=$(sql_esc "$key")
  db_exec "INSERT IGNORE INTO alerts (alert_type,title,message,severity,source_key,auto_resolve) VALUES ('$e_type','$e_title','$e_msg','$e_sev','watchdog:$e_key',1);"
}

# Mark every unresolved alert with source key "watchdog:$1" as resolved.
resolve() {
  local e_key
  e_key=$(sql_esc "$1")
  db_exec "UPDATE alerts SET resolved=1,resolved_at=NOW() WHERE source_key='watchdog:$e_key' AND resolved=0;"
}

# ── Service health ─────────────────────────────────────────────────────────────
for SVC in lsws mysql redis; do
  if systemctl is-active --quiet "$SVC"; then
    resolve "service_down_$SVC"
    continue
  fi

  log "HEAL: $SVC is down — restarting"
  systemctl restart "$SVC"
  # The per-service source key is what resolve() looks for on a later healthy run.
  if systemctl is-active --quiet "$SVC"; then
    log "HEAL: $SVC restarted successfully"
    alert "service_down" "$SVC restarted" "JARVIS watchdog restarted $SVC which was stopped." "warning" "service_down_$SVC"
  else
    log "ERROR: $SVC failed to restart"
    alert "service_down" "$SVC failed to restart" "$SVC is down and could not be restarted automatically." "critical" "service_down_$SVC"
  fi
done

# ── JARVIS HTTP self-check ─────────────────────────────────────────────────────
HTTP_CODE=$(curl -sk -o /dev/null -w "%{http_code}" --max-time 10 https://jarvis.orbishosting.com/api.php 2>/dev/null)
# curl prints 000 when no HTTP response was received (refused, timeout, DNS),
# so that value is treated as a failure alongside 5xx and an empty result.
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" || "$HTTP_CODE" == 5* ]]; then
  log "HEAL: JARVIS HTTP returned $HTTP_CODE — restarting lsws"
  systemctl restart lsws
  alert "jarvis_http" "JARVIS HTTP error — restarted OLS" "JARVIS returned HTTP $HTTP_CODE. OpenLiteSpeed was restarted." "critical"
else
  resolve "jarvis_http"
fi

# ── Disk usage ─────────────────────────────────────────────────────────────────
# -P forces one line per filesystem so the percentage is always field 5.
DISK_PCT=$(df -P / 2>/dev/null | awk 'NR==2 { sub(/%/, "", $5); print $5 }')
if [[ ! "$DISK_PCT" =~ ^[0-9]+$ ]]; then
  # Unknown usage must not fall through to the branch that resolves alerts.
  log "ERROR: could not determine disk usage (got '${DISK_PCT}')"
elif [ "$DISK_PCT" -ge 90 ]; then
  log "ALERT: Disk at ${DISK_PCT}% (critical)"
  alert "disk_critical" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full. Immediate cleanup required." "critical"
elif [ "$DISK_PCT" -ge 80 ]; then
  log "WARN: Disk at ${DISK_PCT}%"
  alert "disk_warning" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full." "warning"
  # Usage dropped below the critical threshold, so that alert no longer holds.
  resolve "disk_critical"
else
  db_exec "UPDATE alerts SET resolved=1,resolved_at=NOW() WHERE source_key IN ('watchdog:disk_critical','watchdog:disk_warning') AND resolved=0;"
fi

# ── Memory usage ──────────────────────────────────────────────────────────────
MEM_TOTAL=$(awk '/MemTotal/ { print $2; exit }' /proc/meminfo 2>/dev/null)
MEM_AVAIL=$(awk '/MemAvailable/ { print $2; exit }' /proc/meminfo 2>/dev/null)
if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_AVAIL" =~ ^[0-9]+$ ]] && [ "$MEM_TOTAL" -gt 0 ]; then
  MEM_PCT=$(( (MEM_TOTAL - MEM_AVAIL) * 100 / MEM_TOTAL ))
  if [ "$MEM_PCT" -ge 90 ]; then
    log "ALERT: Memory at ${MEM_PCT}%"
    alert "mem_critical" "Memory ${MEM_PCT}% used on DO server" "DO server memory is ${MEM_PCT}% used." "critical"
  else
    resolve "mem_critical"
  fi
else
  # Guard against a missing field, which would otherwise be a division by zero.
  log "ERROR: could not read memory usage from /proc/meminfo"
fi

# ── Offline agent auto-restart (Proxmox VMs only) ─────────────────────────────
# Map: agent_id → "proxmox_host vmid"
declare -A AGENT_PVE=(
  ["ollama_vm"]="orbisne.fortiddns.com 210"
  ["ha_vm"]="orbisne.fortiddns.com 101"
  ["networkbackup_vm"]="10.48.200.91 302"
)

OFFLINE=$(db_exec "SELECT agent_id FROM registered_agents WHERE status='offline' AND last_seen < DATE_SUB(NOW(), INTERVAL 5 MINUTE) AND agent_type='linux';")

# One agent id per output line; read into an array so ids are never
# word-split or glob-expanded.
OFFLINE_AGENTS=()
if [[ -n "$OFFLINE" ]]; then
  mapfile -t OFFLINE_AGENTS <<< "$OFFLINE"
fi

for AID in "${OFFLINE_AGENTS[@]}"; do
  [[ -n "$AID" ]] || continue
  # Check if we have a Proxmox mapping for this agent (substring match either way).
  for KEY in "${!AGENT_PVE[@]}"; do
    if [[ "$AID" == *"$KEY"* ]] || [[ "$KEY" == *"$AID"* ]]; then
      read -r PVE_IP VMID <<< "${AGENT_PVE[$KEY]}"
      log "HEAL: Attempting to restart jarvis-agent on $AID (VM $VMID @ $PVE_IP)"
      # NOTE(review): the root password is hard-coded and host-key checking is
      # disabled; key-based auth with a pinned known_hosts entry would be safer.
      sshpass -p 'Joker1974!!!' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
        root@"$PVE_IP" \
        "qm guest exec $VMID -- systemctl restart jarvis-agent" 2>/dev/null
      SSH_RC=$?
      log "HEAL: Restart command sent to $AID (exit: $SSH_RC)"
      alert "agent_offline" "Auto-restarted agent: $AID" \
        "Agent $AID was offline. JARVIS watchdog sent restart command via Proxmox." "warning"
      break
    fi
  done
done

# ── Log rotation (trim to the last 500 lines once a file exceeds 1000) ────────
for LOGFILE in "$LOG" /home/jarvis.orbishosting.com/logs/deploy.log /home/jarvis.orbishosting.com/logs/cron.log; do
  [ -f "$LOGFILE" ] || continue
  LINE_COUNT=$(wc -l < "$LOGFILE")
  if [ "$LINE_COUNT" -gt 1000 ]; then
    # Rewrite the file in place rather than mv-ing the temp file over it, so
    # the log keeps its existing owner and mode for any other writer.
    if tail -n 500 "$LOGFILE" > "${LOGFILE}.tmp"; then
      cat "${LOGFILE}.tmp" > "$LOGFILE"
    fi
    rm -f -- "${LOGFILE}.tmp"
  fi
done