Autonomous systems: watchdog, smart deploy, site health, auto-heal, agent installer

- deploy/jarvis-watchdog.sh: self-healing watchdog (every 5 min)
  * monitors lsws/mysql/redis, restarts on failure
  * JARVIS HTTP self-check, restarts OLS on 5xx
  * disk/memory alerts inserted to DB
  * offline Proxmox VM agents restarted via qm guest exec
  * log rotation (logs over 1000 lines are trimmed to their last 500)
- deploy/jarvis-deploy.sh: smart deploy with PHP validation
  * php8.3 syntax check on every changed .php file
  * auto-reverts git commit + inserts critical alert on syntax error
  * reloads OLS after JARVIS deploys
- api/endpoints/facts_collector.php: site health monitoring
  * curls all 7 managed sites every 3 min
  * stores up/down status in kb_facts
- api/endpoints/alerts.php: auto-heal + site alerts
  * dispatches restart_service commands when services down on agents
  * generates alerts from kb_facts site health data
- public_html/install-agent.sh: one-liner Linux agent installer
  * installs deps, downloads agent, registers with JARVIS, sets up systemd
- public_html/webhook.php: fixed infra deploy path to /opt/infra
This commit is contained in:
2026-05-25 14:08:07 +00:00
parent 3e34b6d796
commit 45fef11785
6 changed files with 352 additions and 3 deletions
+74
View File
@@ -0,0 +1,74 @@
#!/bin/bash
# JARVIS auto-deploy runner: drains the GitHub webhook queue (scheduled every minute).
# PHP files touched by a pull are linted; a failed lint rolls the repository back.
# OpenLiteSpeed is reloaded after a JARVIS deploy so the PHP changes take effect.
QUEUE='/tmp/jarvis-deploy-queue.txt'                  # queued repository paths, one per line
LOG='/home/jarvis.orbishosting.com/logs/deploy.log'   # deploy log (appended to)
PHP='/usr/bin/php8.3'                                 # interpreter used for the lint pass
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() {
  date +'%F %T'
}
# Append "[<timestamp>] <message>" to the file named by $LOG; $1 is the message.
log() {
  printf '[%s] %s\n' "$(TS)" "$1" >> "$LOG"
}
# Nothing to do when the queue is missing or empty (-s implies the file exists).
[ -s "$QUEUE" ] || exit 0
# Claim the queue by renaming it. rename(2) is atomic, so a path appended by the
# webhook while we work lands in a fresh queue file instead of being wiped by a
# read-then-truncate race, and two overlapping runs cannot both take the same
# entries (the loser's mv fails and it exits).
# NOTE(review): assumes the webhook writer creates the queue file when it is
# absent — confirm against webhook.php.
CLAIMED=$(mktemp "${QUEUE}.XXXXXX") || exit 1
if ! mv -f -- "$QUEUE" "$CLAIMED"; then
  rm -f -- "$CLAIMED"
  exit 0
fi
SNAPSHOT=$(cat "$CLAIMED")
rm -f -- "$CLAIMED"
# Escape a value for use inside a single-quoted SQL string literal: backslashes
# and single quotes are doubled so an odd file name cannot terminate the literal.
sql_escape() {
  local s=$1 q="'"
  s=${s//\\/\\\\}
  s=${s//$q/$q$q}
  printf '%s' "$s"
}

# Deploy every queued repository path (one per line of $SNAPSHOT).
while IFS= read -r path; do
  [ -z "$path" ] && continue
  if [ ! -d "$path/.git" ]; then
    log "SKIP $path — not a git repo"
    continue
  fi
  log "Deploying $path"
  cd "$path" || continue

  BEFORE=$(git rev-parse HEAD 2>/dev/null)
  if [ -z "$BEFORE" ]; then
    log "ERROR: cannot resolve HEAD in $path — skipping"
    continue
  fi
  if ! git fetch origin main >> "$LOG" 2>&1; then
    log "ERROR: git fetch failed for $path — skipping"
    continue
  fi
  REMOTE=$(git rev-parse origin/main 2>/dev/null)
  if [ "$BEFORE" = "$REMOTE" ]; then
    log "Already up to date: $path"
    continue
  fi

  # A failed pull must not be reported as a successful deploy. Abort any
  # half-finished merge (best effort) so conflict markers are not left live.
  if ! git pull origin main >> "$LOG" 2>&1; then
    git merge --abort >> "$LOG" 2>&1
    log "ERROR: git pull failed for $path — nothing deployed"
    continue
  fi
  AFTER=$(git rev-parse HEAD 2>/dev/null)
  CHANGED=$(git diff --name-only "$BEFORE" "$AFTER" 2>/dev/null)

  # PHP syntax validation — lint every changed .php file that still exists.
  # Names are read NUL-delimited (-z) so git's quoting of unusual file names
  # cannot make a changed file look missing and slip past the lint.
  SYNTAX_OK=true
  BAD_FILE=""
  while IFS= read -r -d '' f; do
    [[ "$f" == *.php ]] || continue
    [ -f "$f" ] || continue
    if ! "$PHP" -l "$f" > /dev/null 2>&1; then
      SYNTAX_OK=false
      BAD_FILE="$f"
      break
    fi
  done < <(git diff --name-only -z "$BEFORE" "$AFTER" 2>/dev/null)

  if [ "$SYNTAX_OK" = false ]; then
    log "SYNTAX ERROR in $BAD_FILE — reverting to $BEFORE"
    git reset --hard "$BEFORE" >> "$LOG" 2>&1
    # Insert alert into JARVIS DB (file name escaped for the SQL literal).
    # NOTE(review): the DB password is hard-coded and passed on the command
    # line, where it is visible in the process list; consider a root-only
    # MySQL option file instead.
    mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se \
      "INSERT INTO alerts (alert_type,title,message,severity)
       VALUES ('deploy_fail','Deploy reverted: syntax error',
       'PHP syntax error in $(sql_escape "$BAD_FILE"). Commit $AFTER was reverted automatically.','critical');" 2>/dev/null
    log "Reverted. Bad commit: $AFTER"
    continue
  fi

  log "Deploy OK ($BEFORE -> $AFTER): $path"
  log "Changed: $(echo "$CHANGED" | tr '\n' ' ')"

  # Reload OLS after any JARVIS deploy to pick up PHP changes; fall back to a
  # full restart, and only log success when one of the two actually worked.
  if [[ "$path" == *"jarvis"* ]]; then
    if systemctl reload lsws 2>/dev/null || systemctl restart lsws 2>/dev/null; then
      log "OLS reloaded for JARVIS deploy"
    else
      log "ERROR: could not reload or restart OLS after JARVIS deploy"
    fi
  fi
done <<< "$SNAPSHOT"
+110
View File
@@ -0,0 +1,110 @@
#!/bin/bash
# JARVIS self-healing watchdog — scheduled from root's crontab every 5 minutes.
# Monitors: lsws, mysql, redis, the JARVIS HTTP endpoint, disk and memory.
# Heals: restarts failed services and offline Proxmox VM agents.
# Log file: /home/jarvis.orbishosting.com/logs/watchdog.log
LOG='/home/jarvis.orbishosting.com/logs/watchdog.log'
# Command prefix, expanded unquoted as `$MYSQL "<sql>"` so it splits into mysql + args.
# NOTE(review): the credential is hard-coded and ends up on the mysql command
# line; consider a root-only MySQL option file instead.
MYSQL='mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se'
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
TS() {
  date +'%F %T'
}
# Append "[<timestamp>] <message>" to the file named by $LOG; $1 is the message.
log() {
  printf '[%s] %s\n' "$(TS)" "$1" >> "$LOG"
}
# Escape a value for use inside a single-quoted SQL string literal by doubling
# backslashes and single quotes.
_sql_escape() {
  local s=$1 q="'"
  s=${s//\\/\\\\}
  s=${s//$q/$q$q}
  printf '%s' "$s"
}

# Insert a watchdog alert row via $MYSQL.
# Arguments: $1 - alert type (also used to build source_key "watchdog:<type>")
#            $2 - title
#            $3 - message
#            $4 - severity (defaults to "warning")
# INSERT IGNORE is used, so a row the database rejects as a duplicate is
# skipped silently. All values are escaped because titles/messages embed data
# read back from the database (e.g. agent ids).
alert() {
  local type title msg sev
  type=$(_sql_escape "$1")
  title=$(_sql_escape "$2")
  msg=$(_sql_escape "$3")
  sev=$(_sql_escape "${4:-warning}")
  $MYSQL "INSERT IGNORE INTO alerts (alert_type,title,message,severity,source_key,auto_resolve)
VALUES ('$type','$title','$msg','$sev','watchdog:$type',1);" 2>/dev/null
}
# Mark every still-open alert whose source_key is "watchdog:<$1>" as resolved.
resolve() {
  local sql
  printf -v sql '%s\n%s' \
    "UPDATE alerts SET resolved=1,resolved_at=NOW()" \
    "WHERE source_key='watchdog:$1' AND resolved=0;"
  $MYSQL "$sql" 2>/dev/null
}
# ── Service health ─────────────────────────────────────────────────────────────
# Restart any of lsws/mysql/redis that is not active and record the outcome.
# The alert type carries the service name so the alert's source_key
# ("watchdog:service_down_<svc>") is exactly the key resolve() clears once the
# service is healthy again; with a shared "service_down" type the alerts were
# never auto-resolved and all three services collided on one key.
# NOTE(review): alert_type is now "service_down_<svc>" — confirm nothing
# filters on the bare "service_down" value.
for SVC in lsws mysql redis; do
  if systemctl is-active --quiet "$SVC"; then
    resolve "service_down_$SVC"
    continue
  fi
  log "HEAL: $SVC is down — restarting"
  systemctl restart "$SVC"
  if systemctl is-active --quiet "$SVC"; then
    log "HEAL: $SVC restarted successfully"
    alert "service_down_$SVC" "$SVC restarted" "JARVIS watchdog restarted $SVC which was stopped." "warning"
  else
    log "ERROR: $SVC failed to restart"
    alert "service_down_$SVC" "$SVC failed to restart" "$SVC is down and could not be restarted automatically." "critical"
  fi
done
# ── JARVIS HTTP self-check ─────────────────────────────────────────────────────
# Fetch the API endpoint and restart OpenLiteSpeed when it answers 5xx or does
# not answer at all. curl's %{http_code} prints "000" (not an empty string)
# when no HTTP response was received — connection refused, timeout, TLS
# failure — so that value has to be treated as a failure explicitly. The empty
# check remains for the case where curl itself could not be run.
HTTP_CODE=$(curl -sk -o /dev/null -w "%{http_code}" --max-time 10 https://jarvis.orbishosting.com/api.php 2>/dev/null)
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" || "$HTTP_CODE" == 5* ]]; then
  log "HEAL: JARVIS HTTP returned $HTTP_CODE — restarting lsws"
  systemctl restart lsws
  alert "jarvis_http" "JARVIS HTTP error — restarted OLS" "JARVIS returned HTTP $HTTP_CODE. OpenLiteSpeed was restarted." "critical"
else
  resolve "jarvis_http"
fi
# ── Disk usage ─────────────────────────────────────────────────────────────────
# Alert at >=80% (warning) and >=90% (critical) usage of the root filesystem.
# `df -P` keeps each filesystem on one line, so column 5 of line 2 is always
# the usage percentage even when the device name is long.
DISK_PCT=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
  # Without this guard an empty/garbled value makes every numeric test below
  # error out and the else-branch would wrongly resolve open disk alerts.
  log "ERROR: could not read root filesystem usage (got '$DISK_PCT')"
elif [ "$DISK_PCT" -ge 90 ]; then
  log "ALERT: Disk at ${DISK_PCT}% (critical)"
  alert "disk_critical" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full. Immediate cleanup required." "critical"
  # The warning tier is superseded once usage is critical.
  resolve "disk_warning"
elif [ "$DISK_PCT" -ge 80 ]; then
  log "WARN: Disk at ${DISK_PCT}%"
  alert "disk_warning" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full." "warning"
  # Usage dropped back below the critical threshold.
  resolve "disk_critical"
else
  $MYSQL "UPDATE alerts SET resolved=1,resolved_at=NOW() WHERE source_key IN ('watchdog:disk_critical','watchdog:disk_warning') AND resolved=0;" 2>/dev/null
fi
# ── Memory usage ──────────────────────────────────────────────────────────────
# Used percentage = (MemTotal - MemAvailable) / MemTotal, both read from
# /proc/meminfo. Alert at >=90% and clear the alert again once usage drops,
# so a past spike does not leave a critical alert open indefinitely.
MEM_TOTAL=$(awk '/^MemTotal:/ {print $2; exit}' /proc/meminfo)
MEM_AVAIL=$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo)
if ! [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_AVAIL" =~ ^[0-9]+$ ]] || [ "$MEM_TOTAL" -eq 0 ]; then
  # Guards the division below against a missing field or a zero total.
  log "ERROR: could not read memory usage from /proc/meminfo"
else
  MEM_PCT=$(( (MEM_TOTAL - MEM_AVAIL) * 100 / MEM_TOTAL ))
  if [ "$MEM_PCT" -ge 90 ]; then
    log "ALERT: Memory at ${MEM_PCT}%"
    alert "mem_critical" "Memory ${MEM_PCT}% used on DO server" "DO server memory is ${MEM_PCT}% used." "critical"
  else
    resolve "mem_critical"
  fi
fi
# ── Offline agent auto-restart (Proxmox VMs only) ─────────────────────────────
# Map: agent key → "<proxmox host ip> <vmid>"
declare -A AGENT_PVE=(
  ["ollama_vm"]="10.48.200.90 210"
  ["ha_vm"]="10.48.200.90 101"
  ["networkbackup_vm"]="10.48.200.91 302"
)
# Linux agents marked offline whose last_seen is more than 5 minutes old.
OFFLINE=$($MYSQL "SELECT agent_id FROM registered_agents
WHERE status='offline' AND last_seen < DATE_SUB(NOW(), INTERVAL 5 MINUTE)
AND agent_type='linux';" 2>/dev/null)
# One id per output line; reading them into an array avoids glob expansion of
# whatever string an agent registered as its id.
mapfile -t OFFLINE_IDS <<< "$OFFLINE"
for AID in "${OFFLINE_IDS[@]}"; do
  [ -n "$AID" ] || continue
  # Check if we have a Proxmox mapping for this agent (substring match in
  # either direction; the first matching key wins).
  # NOTE(review): the reverse match lets a very short agent id match an
  # unrelated key — confirm the id format and tighten if possible.
  for KEY in "${!AGENT_PVE[@]}"; do
    [[ "$AID" == *"$KEY"* || "$KEY" == *"$AID"* ]] || continue
    read -r PVE_IP VMID <<< "${AGENT_PVE[$KEY]}"
    log "HEAL: Attempting to restart jarvis-agent on $AID (VM $VMID @ $PVE_IP)"
    # NOTE(review): root password is hard-coded and host-key checking is
    # disabled; key-based auth with a pinned host key would be safer.
    sshpass -p 'Joker1974!!!' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
      root@"$PVE_IP" \
      "qm guest exec $VMID -- systemctl restart jarvis-agent" 2>/dev/null
    RC=$?
    log "HEAL: Restart command sent to $AID (exit: $RC)"
    # Only claim a restart when the remote command actually succeeded.
    if [ "$RC" -eq 0 ]; then
      alert "agent_offline" "Auto-restarted agent: $AID" \
        "Agent $AID was offline. JARVIS watchdog sent restart command via Proxmox." "warning"
    else
      alert "agent_offline" "Agent restart failed: $AID" \
        "Agent $AID is offline and the restart command via Proxmox failed (exit $RC)." "critical"
    fi
    break
  done
done
# ── Log rotation (trim to the last 500 lines once a log exceeds 1000) ─────────
# Trim a log file in place.
# Arguments: $1 - log file path
#            $2 - line count above which the file is trimmed (default 1000)
#            $3 - number of trailing lines to keep (default 500)
# Returns:   0 when nothing had to be done or the trim succeeded, 1 otherwise
# The kept tail is written back INTO the existing file rather than moved over
# it, so the file keeps its inode, owner and mode; processes that append to it
# (possibly as another user) keep working after a rotation.
rotate_log() {
  local file=$1 max=${2:-1000} keep=${3:-500}
  local count tmp rc=0
  [ -f "$file" ] || return 0
  count=$(wc -l < "$file") || return 1
  [ "$count" -gt "$max" ] || return 0
  tmp=$(mktemp "${file}.XXXXXX") || return 1
  if tail -n "$keep" "$file" > "$tmp"; then
    cat "$tmp" > "$file" || rc=1
  else
    rc=1
  fi
  rm -f -- "$tmp"
  return "$rc"
}

for LOGFILE in "$LOG" /home/jarvis.orbishosting.com/logs/deploy.log /home/jarvis.orbishosting.com/logs/cron.log; do
  rotate_log "$LOGFILE"
done