nordabiz/scripts/internal_health_logger.py
Maciej Pienczyn 9540f7f2e0
Some checks are pending
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: add uptime monitoring dashboard with UptimeRobot integration
External monitoring via UptimeRobot (free tier) with internal health
logger to differentiate ISP outages from server issues. Includes:
- 4 new DB models (UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog)
- Migration 082 with tables, indexes, and permissions
- Internal health logger script (cron */5 min)
- UptimeRobot sync script (cron hourly) with automatic cause correlation
- Admin dashboard /admin/uptime with uptime %, response time charts,
  incident log with editable notes/causes, pattern analysis, monthly report
- SLA comparison table (99.9%/99.5%/99%)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 07:53:05 +01:00

164 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
Internal Health Logger
======================
Cron job (*/5 * * * *) - records the server state every 5 minutes.
Makes it possible to tell an ISP outage apart from a server failure.
Usage:
*/5 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/internal_health_logger.py
"""
import os
import sys
import subprocess
import urllib.request
import urllib.error
from datetime import datetime, timedelta
# Put the parent of this script's directory on sys.path so that the
# project-level `database` module can be imported when run from cron.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from database import SessionLocal, InternalHealthLog
# Local endpoint probed by check_app_health().
HEALTH_URL = 'http://localhost:5000/health'
# Log rows older than this many days are deleted by cleanup_old_logs().
RETENTION_DAYS = 90
def check_app_health():
    """Check whether the Flask application answers on /health.

    Sends a GET request to HEALTH_URL with a 5 second timeout.

    Returns:
        bool: True only when the response status is 200; False for any
        other status or for any error raised while making the request.
    """
    healthy = False
    try:
        probe = urllib.request.Request(HEALTH_URL, method='GET')
        with urllib.request.urlopen(probe, timeout=5) as reply:
            healthy = reply.status == 200
    except Exception:
        # Connection refused, timeout, HTTP error, ... all count as "down".
        healthy = False
    return healthy
def check_db_health():
    """Check whether the database behind ``SessionLocal`` is reachable.

    Opens a session, runs ``SELECT 1`` and always closes the session again,
    including when the query itself fails (previously the session was only
    closed on the success path and leaked on errors).

    Returns:
        bool: True if the session could be opened, queried and closed
        without an exception; False otherwise.
    """
    db = None
    try:
        # Kept inside the try so that an import failure is reported as
        # "database not healthy" rather than aborting the whole run.
        from sqlalchemy import text
        db = SessionLocal()
        db.execute(text('SELECT 1'))
        healthy = True
    except Exception:
        healthy = False
    finally:
        if db is not None:
            try:
                db.close()
            except Exception:
                # A session that cannot be closed cleanly is not healthy.
                healthy = False
    return healthy
def get_cpu_percent():
    """Return the overall CPU usage in percent, as reported by ``top``.

    Runs ``top -bn1`` once and computes the value as ``100 - idle`` from the
    first summary line that contains both ``Cpu`` and an ``id`` (idle) field.

    Returns:
        float | None: Usage rounded to two decimals, or None when ``top``
        cannot be run or its output cannot be parsed.
    """
    try:
        result = subprocess.run(
            ['top', '-bn1'],
            capture_output=True, text=True, timeout=10,
            # Force the C locale so the decimal separator is always '.',
            # which the comma handling below relies on.
            env={**os.environ, 'LC_ALL': 'C'},
        )
        for line in result.stdout.splitlines():
            if 'Cpu' not in line:
                continue
            # Format: %Cpu(s):  2.3 us,  0.5 sy,  0.0 ni, 96.2 id, ...
            # procps top prints fixed-width numbers, so a full-width value
            # ends up glued to the previous label ("0.0 ni,100.0 id").
            # Turning commas into spaces keeps value and label apart.
            tokens = line.replace(',', ' ').split()
            if 'id' not in tokens:
                continue
            idle_pos = tokens.index('id')
            if idle_pos == 0:
                # No value precedes the label; try the next matching line.
                continue
            idle = float(tokens[idle_pos - 1])
            return round(100.0 - idle, 2)
        return None
    except Exception:
        # Best effort: a missing metric must not abort the health log entry.
        return None
def get_ram_percent():
    """Return the RAM usage in percent, based on ``free -m``.

    Usage is computed as ``(1 - available / total) * 100`` from the ``Mem:``
    row. The ``total`` and ``available`` columns are located through the
    header line instead of fixed positions, so a ``free`` build without an
    ``available`` column yields None rather than a value taken from the
    wrong column.

    Returns:
        float | None: Usage rounded to two decimals, or None when ``free``
        cannot be run, the expected columns are missing, total is zero, or
        the output cannot be parsed.
    """
    try:
        result = subprocess.run(
            ['free', '-m'],
            capture_output=True, text=True, timeout=5,
            # Force the C locale: the "Mem:" label and the column headers
            # are translated in other locales.
            env={**os.environ, 'LC_ALL': 'C'},
        )
        lines = result.stdout.splitlines()
        if not lines:
            return None
        header = lines[0].split()
        if 'total' not in header or 'available' not in header:
            return None
        # Data rows start with a label ("Mem:"), so every value sits one
        # position to the right of its header name.
        total_col = header.index('total') + 1
        available_col = header.index('available') + 1
        for line in lines[1:]:
            if not line.startswith('Mem:'):
                continue
            parts = line.split()
            total = float(parts[total_col])
            if total <= 0:
                return None
            available = float(parts[available_col])
            return round((1 - available / total) * 100, 2)
        return None
    except Exception:
        # Best effort: a missing metric must not abort the health log entry.
        return None
def get_disk_percent():
    """Return the used-space percentage of the root filesystem.

    Runs ``df -h /`` and reads the first field ending in ``%`` from the
    second output line (the data row below the header).

    Returns:
        float | None: The percentage, or None when ``df`` cannot be run,
        prints fewer than two lines, has no ``%`` field, or cannot be parsed.
    """
    try:
        df = subprocess.run(
            ['df', '-h', '/'],
            capture_output=True, text=True, timeout=5
        )
        rows = df.stdout.strip().split('\n')
        if len(rows) < 2:
            return None
        # Format: Filesystem Size Used Avail Use% Mounted
        usage = next(
            (field for field in rows[1].split() if field.endswith('%')),
            None,
        )
        if usage is None:
            return None
        return float(usage.rstrip('%'))
    except Exception:
        return None
def get_gunicorn_workers():
    """Count the processes that ``pgrep`` matches for ``gunicorn``.

    Returns:
        int: The count printed by ``pgrep -c gunicorn`` when it exits with
        status 0; 0 when it exits non-zero or when anything goes wrong.
    """
    try:
        pgrep = subprocess.run(
            ['pgrep', '-c', 'gunicorn'],
            capture_output=True, text=True, timeout=5
        )
        if pgrep.returncode != 0:
            return 0
        return int(pgrep.stdout.strip())
    except Exception:
        return 0
def cleanup_old_logs(db):
    """Delete health-log rows older than RETENTION_DAYS.

    Args:
        db: Open database session used for the delete and the commit.

    The delete is committed, and the number of removed rows printed, only
    when at least one row was deleted.
    """
    threshold = datetime.now() - timedelta(days=RETENTION_DAYS)
    expired = db.query(InternalHealthLog).filter(
        InternalHealthLog.checked_at < threshold
    )
    deleted = expired.delete()
    if not deleted:
        return
    db.commit()
    print(f"Usunięto {deleted} starych logów health (>{RETENTION_DAYS} dni)")
def main():
    """Record one health snapshot and print a one-line summary.

    Runs every probe, stores the result as an InternalHealthLog row and
    commits it. When the current time is within the first five minutes
    after 03:00, cleanup_old_logs() is called as well. Any exception is
    printed to stderr and the session is rolled back; the session is closed
    in every case.
    """
    db = SessionLocal()
    try:
        # Probes run in the same order as the keyword arguments below.
        snapshot = dict(
            checked_at=datetime.now(),
            app_ok=check_app_health(),
            db_ok=check_db_health(),
            cpu_percent=get_cpu_percent(),
            ram_percent=get_ram_percent(),
            disk_percent=get_disk_percent(),
            gunicorn_workers=get_gunicorn_workers(),
        )
        log = InternalHealthLog(**snapshot)
        db.add(log)
        db.commit()

        # Retention cleanup is limited to the 03:00-03:04 window so that it
        # does not run on every invocation.
        now = datetime.now()
        in_cleanup_window = now.hour == 3 and now.minute < 5
        if in_cleanup_window:
            cleanup_old_logs(db)

        summary = (
            f"[{log.checked_at}] app={log.app_ok} db={log.db_ok} "
            f"cpu={log.cpu_percent}% ram={log.ram_percent}% disk={log.disk_percent}% "
            f"workers={log.gunicorn_workers}"
        )
        print(summary)
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        db.rollback()
    finally:
        db.close()
# Run a single health check when the file is executed as a script.
if __name__ == '__main__':
    main()