mirror of
https://github.com/myronblair/jarvis
synced 2026-06-30 17:50:23 -05:00
Autonomous systems: watchdog, smart deploy, site health, auto-heal, agent installer
- deploy/jarvis-watchdog.sh: self-healing watchdog (every 5 min) * monitors lsws/mysql/redis, restarts on failure * JARVIS HTTP self-check, restarts OLS on 5xx * disk/memory alerts inserted to DB * offline Proxmox VM agents restarted via qm guest exec * log rotation (1000 line cap) - deploy/jarvis-deploy.sh: smart deploy with PHP validation * php8.3 syntax check on every changed .php file * auto-reverts git commit + inserts critical alert on syntax error * reloads OLS after JARVIS deploys - api/endpoints/facts_collector.php: site health monitoring * curls all 7 managed sites every 3 min * stores up/down status in kb_facts - api/endpoints/alerts.php: auto-heal + site alerts * dispatches restart_service commands when services down on agents * generates alerts from kb_facts site health data - public_html/install-agent.sh: one-liner Linux agent installer * installs deps, downloads agent, registers with JARVIS, sets up systemd - public_html/webhook.php: fixed infra deploy path to /opt/infra
This commit is contained in:
@@ -87,15 +87,56 @@ function refresh_agent_alerts(): void {
|
||||
}
|
||||
}
|
||||
|
||||
// Services down
|
||||
// Services down — alert AND dispatch auto-restart command
|
||||
foreach (($d['services'] ?? []) as $svc) {
|
||||
if (($svc['status'] ?? '') === 'active') continue;
|
||||
if (($svc['status'] ?? '') === 'unknown') continue; // not watched/installed
|
||||
if (($svc['status'] ?? '') === 'unknown') continue;
|
||||
$svcName = $svc['service'] ?? '';
|
||||
$key = 'agent:' . $id . ':svc:' . $svcName;
|
||||
upsert_alert($key, 'warning', 'Service Down: ' . $svcName . ' on ' . $hn,
|
||||
$svcName . ' is ' . ($svc['status'] ?? 'inactive') . ' on ' . $hn . '.');
|
||||
$still_active[$key] = true;
|
||||
// Auto-dispatch restart if no pending command already queued
|
||||
$pending = JarvisDB::query(
|
||||
"SELECT id FROM agent_commands WHERE agent_id=? AND command_type='restart_service'
|
||||
AND status IN ('pending','delivered') AND created_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)
|
||||
AND JSON_EXTRACT(command_data,'$.service')=?",
|
||||
[$id, $svcName]
|
||||
);
|
||||
if (empty($pending)) {
|
||||
JarvisDB::query(
|
||||
"INSERT INTO agent_commands (agent_id, command_type, command_data, status)
|
||||
VALUES (?,?,?,?)",
|
||||
[$id, 'restart_service', json_encode(['service' => $svcName]), 'pending']
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Site health alerts from kb_facts ──────────────────────────────────────
|
||||
$siteKeys = ['jarvis','tomsjavajive','epictravelexp','parkersling','orbishosting','orbisportal','tomtomgames'];
|
||||
$siteNames = [
|
||||
'jarvis' => 'jarvis.orbishosting.com',
|
||||
'tomsjavajive' => 'tomsjavajive.com',
|
||||
'epictravelexp'=> 'epictravelexpeditions.com',
|
||||
'parkersling' => 'parkerslingshot.epictravelexpeditions.com',
|
||||
'orbishosting' => 'orbishosting.com',
|
||||
'orbisportal' => 'orbis.orbishosting.com',
|
||||
'tomtomgames' => 'tomtomgames.com',
|
||||
];
|
||||
$siteFacts = JarvisDB::query(
|
||||
"SELECT fact_key, fact_value FROM kb_facts WHERE category='sites'
|
||||
AND updated_at > DATE_SUB(NOW(), INTERVAL 10 MINUTE)"
|
||||
);
|
||||
foreach ($siteFacts as $sf) {
|
||||
$skey = $sf['fact_key'];
|
||||
$status = $sf['fact_value'];
|
||||
$domain = $siteNames[$skey] ?? $skey;
|
||||
if ($status !== 'up') {
|
||||
$alertKey = 'site:' . $skey . ':down';
|
||||
upsert_alert($alertKey, 'critical', 'Site Down: ' . $domain,
|
||||
$domain . ' returned status ' . $status . '. Site may be unreachable.');
|
||||
$still_active[$alertKey] = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -278,6 +278,39 @@ function collect_all(): array {
|
||||
$results['ollama'] = 'error: ' . $e->getMessage();
|
||||
}
|
||||
|
||||
// ── Site Health ───────────────────────────────────────────────────────
|
||||
try {
|
||||
$sites = [
|
||||
'jarvis' => 'https://jarvis.orbishosting.com',
|
||||
'tomsjavajive' => 'https://tomsjavajive.com',
|
||||
'epictravelexp'=> 'https://epictravelexpeditions.com',
|
||||
'parkersling' => 'https://parkerslingshot.epictravelexpeditions.com',
|
||||
'orbishosting' => 'https://orbishosting.com',
|
||||
'orbisportal' => 'https://orbis.orbishosting.com',
|
||||
'tomtomgames' => 'https://tomtomgames.com',
|
||||
];
|
||||
$down = [];
|
||||
foreach ($sites as $key => $url) {
|
||||
$ch = curl_init($url);
|
||||
curl_setopt_array($ch, [
|
||||
CURLOPT_RETURNTRANSFER => true,
|
||||
CURLOPT_FOLLOWLOCATION => true,
|
||||
CURLOPT_TIMEOUT => 10,
|
||||
CURLOPT_CONNECTTIMEOUT => 5,
|
||||
CURLOPT_NOBODY => true,
|
||||
]);
|
||||
curl_exec($ch);
|
||||
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
|
||||
curl_close($ch);
|
||||
$status = ($code >= 200 && $code < 400) ? 'up' : "down-$code";
|
||||
KBEngine::storeFact('sites', $key, $status, $url, 180);
|
||||
if ($status !== 'up') $down[] = "$key($code)";
|
||||
}
|
||||
$results['sites'] = empty($down) ? 'all up' : 'DOWN: ' . implode(', ', $down);
|
||||
} catch (Exception $e) {
|
||||
$results['sites'] = 'error: ' . $e->getMessage();
|
||||
}
|
||||
|
||||
return $results;
|
||||
}
|
||||
|
||||
|
||||
Executable
+74
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash
# JARVIS Auto-Deploy Runner — processes GitHub webhook queue every minute.
# Validates PHP syntax before deploying; auto-reverts on bad code.
# Restarts OLS after JARVIS deploys to pick up PHP changes.
#
# The queue file holds one repository path per line. For each path:
#   1. fetch origin/main and skip the path when HEAD already matches it
#   2. pull, then lint every changed .php file with "$PHP -l"
#   3. on a lint failure, hard-reset to the pre-pull commit and insert a
#      critical row into the alerts table
#   4. on success, log the changed files and reload OLS when the path
#      contains "jarvis"

QUEUE=/tmp/jarvis-deploy-queue.txt
LOG=/home/jarvis.orbishosting.com/logs/deploy.log
PHP=/usr/bin/php8.3

# Print the current time in the format used for log lines.
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Append one timestamped line to the deploy log.
log() { echo "[$(TS)] $1" >> "$LOG"; }

# Escape a value for use inside a single-quoted MySQL string literal:
# backslashes are doubled first, then single quotes are doubled.
sql_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e "s/'/''/g"
}

# Nothing to do when the queue is missing or empty.
[ ! -f "$QUEUE" ] && exit 0
[ ! -s "$QUEUE" ] && exit 0

# Snapshot the queue, then clear it.
# NOTE(review): this is read-then-truncate, not an atomic swap — a path that
# is appended between the two commands is dropped. Closing the gap needs
# cooperation from whatever writes the queue (not visible from this script).
SNAPSHOT=$(cat "$QUEUE")
> "$QUEUE"

while IFS= read -r path; do
    [ -z "$path" ] && continue
    if [ ! -d "$path/.git" ]; then
        log "SKIP $path — not a git repo"
        continue
    fi

    log "Deploying $path"
    cd "$path" || continue

    BEFORE=$(git rev-parse HEAD 2>/dev/null)
    if ! git fetch origin main >> "$LOG" 2>&1; then
        # Without a fresh origin/main the comparison below would use stale
        # data, and the pull would fail for the same reason.
        log "ERROR: git fetch failed for $path — skipping"
        continue
    fi
    REMOTE=$(git rev-parse origin/main 2>/dev/null)

    # Both commits are needed: BEFORE is the revert target, REMOTE decides
    # whether there is anything to deploy.
    if [ -z "$BEFORE" ] || [ -z "$REMOTE" ]; then
        log "ERROR: could not resolve HEAD or origin/main in $path — skipping"
        continue
    fi

    if [ "$BEFORE" = "$REMOTE" ]; then
        log "Already up to date: $path"
        continue
    fi

    if ! git pull origin main >> "$LOG" 2>&1; then
        log "ERROR: git pull failed for $path — not deployed"
        # A failed merge can leave conflict markers in live files; put the
        # working tree back the way it was before the pull.
        if [ -f .git/MERGE_HEAD ]; then
            git merge --abort >> "$LOG" 2>&1
        fi
        continue
    fi
    AFTER=$(git rev-parse HEAD 2>/dev/null)
    CHANGED=$(git diff --name-only "$BEFORE" "$AFTER" 2>/dev/null)

    # PHP syntax validation — check every changed .php file
    SYNTAX_OK=true
    BAD_FILE=""
    LINT_OUT=""
    while IFS= read -r f; do
        [[ "$f" != *.php ]] && continue
        [ ! -f "$f" ] && continue          # deleted by this change
        if ! LINT_OUT=$("$PHP" -l "$f" 2>&1); then
            SYNTAX_OK=false
            BAD_FILE="$f"
            break
        fi
    done <<< "$CHANGED"

    if [ "$SYNTAX_OK" = false ]; then
        log "SYNTAX ERROR in $BAD_FILE — reverting to $BEFORE"
        # Keep the interpreter's own message so the failure can be diagnosed
        # from the log alone.
        log "php -l: $(echo "$LINT_OUT" | tr '\n' ' ')"
        if ! git reset --hard "$BEFORE" >> "$LOG" 2>&1; then
            log "ERROR: git reset --hard $BEFORE failed in $path"
        fi
        # Insert alert into JARVIS DB. The file name comes from the
        # repository, so it is escaped before being placed in the statement.
        # NOTE(review): the database password is embedded in this script and
        # passed on the command line — consider a root-only MySQL option file.
        BAD_FILE_SQL=$(sql_escape "$BAD_FILE")
        mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se \
            "INSERT INTO alerts (alert_type,title,message,severity)
             VALUES ('deploy_fail','Deploy reverted: syntax error',
                     'PHP syntax error in $BAD_FILE_SQL. Commit $AFTER was reverted automatically.','critical');" 2>/dev/null
        log "Reverted. Bad commit: $AFTER"
        continue
    fi

    log "Deploy OK ($BEFORE -> $AFTER): $path"
    log "Changed: $(echo "$CHANGED" | tr '\n' ' ')"

    # Restart OLS after any JARVIS deploy to pick up PHP changes
    if [[ "$path" == *"jarvis"* ]]; then
        if systemctl reload lsws 2>/dev/null || systemctl restart lsws 2>/dev/null; then
            log "OLS reloaded for JARVIS deploy"
        else
            log "ERROR: could not reload or restart OLS after JARVIS deploy"
        fi
    fi

done <<< "$SNAPSHOT"
|
||||
Executable
+110
@@ -0,0 +1,110 @@
|
||||
#!/bin/bash
# JARVIS Self-Healing Watchdog — runs every 5 min via root cron
# Checks: lsws, mysql, redis, JARVIS HTTP, disk, memory
# Auto-heals: restarts failed services, restarts offline Proxmox VM agents
# Logs to: /home/jarvis.orbishosting.com/logs/watchdog.log

LOG=/home/jarvis.orbishosting.com/logs/watchdog.log
# NOTE(review): the database password is embedded in this script and passed
# on the command line — consider a root-only MySQL option file.
MYSQL="mysql -u jarvis_user -pJ4rv1s_Pr0t0c0l_2026! jarvis_db -se"

# Print the current time in the format used for log lines.
TS() { date '+%Y-%m-%d %H:%M:%S'; }

# Append one timestamped line to the watchdog log.
log() { echo "[$(TS)] $1" >> "$LOG"; }

# Escape a value for use inside a single-quoted MySQL string literal:
# backslashes are doubled first, then single quotes are doubled.
sql_escape() {
    printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e "s/'/''/g"
}

# alert TYPE TITLE MESSAGE [SEVERITY] [KEY]
# Insert an auto-resolving alert row. SEVERITY defaults to "warning".
# The row's source_key is "watchdog:KEY"; KEY defaults to TYPE, and is what
# resolve() must be given to clear the alert again.
alert() {
    local type="$1" sev="${4:-warning}" key="${5:-$1}"
    local title msg
    title=$(sql_escape "$2")
    msg=$(sql_escape "$3")
    $MYSQL "INSERT IGNORE INTO alerts (alert_type,title,message,severity,source_key,auto_resolve)
            VALUES ('$type','$title','$msg','$sev','watchdog:$key',1);" 2>/dev/null
}

# resolve KEY
# Mark every unresolved alert whose source_key is "watchdog:KEY" as resolved.
resolve() {
    $MYSQL "UPDATE alerts SET resolved=1,resolved_at=NOW()
            WHERE source_key='watchdog:$1' AND resolved=0;" 2>/dev/null
}

# ── Service health ─────────────────────────────────────────────────────────────
for SVC in lsws mysql redis; do
    # One source key per service, so the resolve call in the healthy branch
    # clears exactly the alert that was raised for this service.
    SVC_KEY="service_down_$SVC"
    if ! systemctl is-active --quiet "$SVC"; then
        log "HEAL: $SVC is down — restarting"
        systemctl restart "$SVC"
        if systemctl is-active --quiet "$SVC"; then
            log "HEAL: $SVC restarted successfully"
            alert "service_down" "$SVC restarted" "JARVIS watchdog restarted $SVC which was stopped." "warning" "$SVC_KEY"
        else
            log "ERROR: $SVC failed to restart"
            alert "service_down" "$SVC failed to restart" "$SVC is down and could not be restarted automatically." "critical" "$SVC_KEY"
        fi
    else
        resolve "$SVC_KEY"
    fi
done

# ── JARVIS HTTP self-check ─────────────────────────────────────────────────────
HTTP_CODE=$(curl -sk -o /dev/null -w "%{http_code}" --max-time 10 https://jarvis.orbishosting.com/api.php 2>/dev/null)
# curl prints 000 for http_code when no HTTP response was received (refused
# connection, timeout), so that value is treated like an empty result.
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" || "$HTTP_CODE" == "5"* ]]; then
    log "HEAL: JARVIS HTTP returned $HTTP_CODE — restarting lsws"
    systemctl restart lsws
    alert "jarvis_http" "JARVIS HTTP error — restarted OLS" "JARVIS returned HTTP $HTTP_CODE. OpenLiteSpeed was restarted." "critical"
else
    resolve "jarvis_http"
fi

# ── Disk usage ─────────────────────────────────────────────────────────────────
# -P keeps each filesystem on a single line so the awk column is stable.
DISK_PCT=$(df -P / | awk 'NR==2{print $5}' | tr -d '%')
if ! [[ "$DISK_PCT" =~ ^[0-9]+$ ]]; then
    log "WARN: could not determine disk usage (got '$DISK_PCT')"
elif [ "$DISK_PCT" -ge 90 ]; then
    log "ALERT: Disk at ${DISK_PCT}% (critical)"
    alert "disk_critical" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full. Immediate cleanup required." "critical"
elif [ "$DISK_PCT" -ge 80 ]; then
    log "WARN: Disk at ${DISK_PCT}%"
    alert "disk_warning" "Disk ${DISK_PCT}% full on DO server" "Root filesystem is ${DISK_PCT}% full." "warning"
    # Usage has dropped out of the critical band.
    resolve "disk_critical"
else
    resolve "disk_critical"
    resolve "disk_warning"
fi

# ── Memory usage ──────────────────────────────────────────────────────────────
MEM_TOTAL=$(awk '/^MemTotal:/{print $2}' /proc/meminfo)
MEM_AVAIL=$(awk '/^MemAvailable:/{print $2}' /proc/meminfo)
# Guard the arithmetic: an empty or zero value would abort the expression.
if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_AVAIL" =~ ^[0-9]+$ && "$MEM_TOTAL" -gt 0 ]]; then
    MEM_PCT=$(( (MEM_TOTAL - MEM_AVAIL) * 100 / MEM_TOTAL ))
    if [ "$MEM_PCT" -ge 90 ]; then
        log "ALERT: Memory at ${MEM_PCT}%"
        alert "mem_critical" "Memory ${MEM_PCT}% used on DO server" "DO server memory is ${MEM_PCT}% used." "critical"
    else
        resolve "mem_critical"
    fi
else
    log "WARN: could not read MemTotal/MemAvailable from /proc/meminfo"
fi

# ── Offline agent auto-restart (Proxmox VMs only) ─────────────────────────────
# Map: agent_id → "proxmox_ip vmid"
declare -A AGENT_PVE=(
    ["ollama_vm"]="10.48.200.90 210"
    ["ha_vm"]="10.48.200.90 101"
    ["networkbackup_vm"]="10.48.200.91 302"
)

OFFLINE=$($MYSQL "SELECT agent_id FROM registered_agents
                  WHERE status='offline' AND last_seen < DATE_SUB(NOW(), INTERVAL 5 MINUTE)
                  AND agent_type='linux';" 2>/dev/null)

for AID in $OFFLINE; do
    # Check if we have a Proxmox mapping for this agent (substring match in
    # either direction between the agent id and the map key).
    for KEY in "${!AGENT_PVE[@]}"; do
        if [[ "$AID" == *"$KEY"* || "$KEY" == *"$AID"* ]]; then
            read -r PVE_IP VMID <<< "${AGENT_PVE[$KEY]}"
            log "HEAL: Attempting to restart jarvis-agent on $AID (VM $VMID @ $PVE_IP)"
            # NOTE(review): root SSH password embedded in the script and host
            # key checking disabled — consider key-based authentication.
            sshpass -p 'Joker1974!!!' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 \
                root@"$PVE_IP" \
                "qm guest exec $VMID -- systemctl restart jarvis-agent" 2>/dev/null
            SSH_RC=$?
            log "HEAL: Restart command sent to $AID (exit: $SSH_RC)"
            alert "agent_offline" "Auto-restarted agent: $AID" \
                "Agent $AID was offline. JARVIS watchdog sent restart command via Proxmox." "warning"
            break
        fi
    done
done

# ── Log rotation (over 1000 lines → trim to the last 500) ─────────────────────
for LOGFILE in "$LOG" /home/jarvis.orbishosting.com/logs/deploy.log /home/jarvis.orbishosting.com/logs/cron.log; do
    [ -f "$LOGFILE" ] || continue
    LINE_COUNT=$(wc -l < "$LOGFILE")
    if [ "$LINE_COUNT" -gt 1000 ]; then
        # Rewrite the file in place rather than renaming a new file over it,
        # so its owner and mode stay as they were and processes that hold it
        # open for appending keep writing to the same file.
        if tail -n 500 "$LOGFILE" > "${LOGFILE}.tmp"; then
            cat "${LOGFILE}.tmp" > "$LOGFILE"
        fi
        rm -f "${LOGFILE}.tmp"
    fi
done
|
||||
Executable
+91
@@ -0,0 +1,91 @@
|
||||
#!/bin/bash
# JARVIS Agent Installer — one-liner for any Linux host:
#   curl -sk https://jarvis.orbishosting.com/install-agent.sh | bash -s <hostname> <agent_type>
#
# agent_type: linux | proxmox | homeassistant
# Example: curl -sk https://jarvis.orbishosting.com/install-agent.sh | bash -s myserver linux
#
# Steps: install dependencies, download the agent, register with JARVIS to
# obtain an agent id and API key, write config.json, then install and start
# the jarvis-agent systemd unit.

set -e

# Print an error to stderr and stop the installer.
die() { echo "ERROR: $*" >&2; exit 1; }

# Read a JSON object on stdin and print the value stored under key $1.
# Prints nothing when the input is not a JSON object or the key is missing
# or null. Always succeeds, so a bad response does not trip `set -e` before
# the caller can report it.
json_field() {
    python3 -c 'import sys, json
v = json.load(sys.stdin).get(sys.argv[1])
print("" if v is None else v)' "$1" 2>/dev/null || true
}

AGENT_HOSTNAME="${1:-$(hostname -s)}"
AGENT_TYPE="${2:-linux}"
# NOTE(review): the server is addressed by IP with an explicit Host header and
# every curl call uses -k, so the TLS certificate is not verified.
JARVIS_URL="https://165.22.1.228"
JARVIS_HOST="jarvis.orbishosting.com"
INSTALL_DIR="/opt/jarvis-agent"
SERVICE_FILE="/etc/systemd/system/jarvis-agent.service"

echo "=== JARVIS Agent Installer ==="
echo "Host: $AGENT_HOSTNAME | Type: $AGENT_TYPE | Server: $JARVIS_URL"

# Everything below writes under /opt and /etc/systemd and drives systemctl.
[ "$(id -u)" -eq 0 ] || die "this installer must be run as root"

# ── Dependencies ──────────────────────────────────────────────────────────────
if command -v apt-get &>/dev/null; then
    apt-get install -yq python3 python3-pip curl 2>/dev/null \
        || die "apt-get could not install python3, python3-pip and curl"
elif command -v yum &>/dev/null; then
    yum install -yq python3 python3-pip curl 2>/dev/null \
        || die "yum could not install python3, python3-pip and curl"
fi

# pip may refuse to install into a distribution-managed Python; do not abort
# here, verify the modules afterwards instead.
pip3 install -q requests psutil 2>/dev/null \
    || pip install -q requests psutil 2>/dev/null \
    || true
if ! python3 -c 'import requests, psutil' 2>/dev/null; then
    # Fall back to the distribution packages where apt is available.
    if command -v apt-get &>/dev/null; then
        apt-get install -yq python3-requests python3-psutil 2>/dev/null || true
    fi
    python3 -c 'import requests, psutil' 2>/dev/null \
        || die "Python modules 'requests' and 'psutil' could not be installed"
fi

# ── Download agent ─────────────────────────────────────────────────────────────
mkdir -p "$INSTALL_DIR"
# -f makes curl fail on an HTTP error status instead of saving the error page
# as the agent script.
curl -fsk -H "Host: $JARVIS_HOST" "$JARVIS_URL/agent/jarvis-agent.py" -o "$INSTALL_DIR/jarvis-agent.py" \
    || die "could not download jarvis-agent.py from $JARVIS_URL"
[ -s "$INSTALL_DIR/jarvis-agent.py" ] || die "downloaded jarvis-agent.py is empty"
chmod +x "$INSTALL_DIR/jarvis-agent.py"

# ── Register with JARVIS to get API key ───────────────────────────────────────
IP=$(hostname -I | awk '{print $1}')
# NOTE(review): the registration key is embedded in this publicly served
# script, so anyone who can fetch the installer can register an agent.
# No -f here: on an HTTP error the response body is kept so it can be shown.
REG=$(curl -sk -H "Host: $JARVIS_HOST" \
    -H "Content-Type: application/json" \
    -H "X-Registration-Key: f846a9aaf7ce9a61742c63c87c4186052a71d2a580c65518" \
    -X POST "$JARVIS_URL/api/agent/register" \
    -d "{\"hostname\":\"$AGENT_HOSTNAME\",\"agent_type\":\"$AGENT_TYPE\",\"ip_address\":\"$IP\",\"capabilities\":[\"metrics\",\"commands\"]}") \
    || die "registration request to $JARVIS_URL failed"

AGENT_ID=$(printf '%s\n' "$REG" | json_field agent_id)
API_KEY=$(printf '%s\n' "$REG" | json_field api_key)

if [ -z "$API_KEY" ]; then
    echo "ERROR: Registration failed — $REG"
    exit 1
fi

echo "Registered: agent_id=$AGENT_ID"

# ── Write config ───────────────────────────────────────────────────────────────
# The file holds the API key: create it owner-only before writing into it.
touch "$INSTALL_DIR/config.json"
chmod 600 "$INSTALL_DIR/config.json"
cat > "$INSTALL_DIR/config.json" << EOF
{
  "server_url": "$JARVIS_URL",
  "host_header": "$JARVIS_HOST",
  "agent_id": "$AGENT_ID",
  "api_key": "$API_KEY",
  "agent_type": "$AGENT_TYPE",
  "heartbeat_interval": 10,
  "metrics_interval": 30
}
EOF

# ── Systemd service ────────────────────────────────────────────────────────────
cat > "$SERVICE_FILE" << EOF
[Unit]
Description=JARVIS Agent
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=/usr/bin/python3 $INSTALL_DIR/jarvis-agent.py
WorkingDirectory=$INSTALL_DIR
Restart=always
RestartSec=10
StartLimitInterval=60
StartLimitBurst=5

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable jarvis-agent
systemctl restart jarvis-agent

# Report success only once the unit is actually active.
if ! systemctl is-active --quiet jarvis-agent; then
    die "jarvis-agent did not start — inspect with: journalctl -u jarvis-agent -e"
fi

echo "=== JARVIS Agent installed and running ==="
echo "Config: $INSTALL_DIR/config.json"
echo "Logs: journalctl -u jarvis-agent -f"
systemctl is-active jarvis-agent
|
||||
@@ -40,7 +40,7 @@ $repoMap = [
|
||||
'orbishosting' => '/home/orbishosting.com/public_html',
|
||||
'orbis-hosting-portal' => '/home/orbis.orbishosting.com/public_html',
|
||||
'tomtomgames' => '/home/tomtomgames.com/public_html',
|
||||
'infra' => '/tmp/infra-current',
|
||||
'infra' => '/opt/infra',
|
||||
];
|
||||
|
||||
if (!isset($repoMap[$repo])) {
|
||||
|
||||
Reference in New Issue
Block a user