nordabiz/scripts/internal_health_logger.py

#!/usr/bin/env python3
"""
Internal Health Logger
======================
Cron job (*/5 * * * *) - zapisuje stan serwera co 5 minut.
Pozwala odróżnić awarię ISP od awarii serwera.

Użycie:
  */5 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/internal_health_logger.py
"""

import os
import re
import subprocess
import sys
import urllib.error
import urllib.request
from datetime import datetime, timedelta

# Prepend the parent of this script's directory to sys.path so the `database`
# module imported below can be resolved (presumably it lives there — confirm
# against the repository layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from database import SessionLocal, InternalHealthLog

# URL fetched by check_app_health(); only an HTTP 200 answer counts as healthy.
HEALTH_URL = 'http://localhost:5000/health'
# Age in days after which cleanup_old_logs() deletes health-log rows.
RETENTION_DAYS = 90


def check_app_health():
    """Report whether the application answers ``GET HEALTH_URL`` with HTTP 200.

    Returns:
        ``True`` for a 200 response received within 5 seconds; ``False`` for
        any other status or any failure (refused connection, timeout, HTTP
        error raised by ``urlopen``, ...).
    """
    try:
        request = urllib.request.Request(HEALTH_URL, method='GET')
        response = urllib.request.urlopen(request, timeout=5)
        with response:
            healthy = response.status == 200
    except Exception:
        # Best-effort probe: every failure simply means "not healthy".
        return False
    return healthy


def check_db_health():
    """Report whether the database answers a trivial ``SELECT 1``.

    A dedicated session is opened for the probe and is always closed again,
    including when the query fails, so a database outage does not leave a
    dangling session behind.

    Returns:
        ``True`` when the query succeeds, ``False`` on any failure.
    """
    try:
        # Imported locally, as in the original code, so an import problem is
        # reported as "database not healthy" instead of crashing the script.
        from sqlalchemy import text

        db = SessionLocal()
        try:
            db.execute(text('SELECT 1'))
            return True
        finally:
            # Runs on success and on failure alike; previously close() was
            # skipped whenever execute() raised.
            db.close()
    except Exception:
        # Best-effort probe: every failure simply means "not healthy".
        return False


def get_cpu_percent(top_output=None):
    """Return overall CPU usage in percent, computed from ``top``'s idle figure.

    The first line containing ``Cpu`` that also carries an idle (``id``)
    value is used, and the result is ``100 - idle``.

    Args:
        top_output: Optional pre-captured text of ``top -bn1``. When ``None``
            (the default) the command is executed.

    Returns:
        CPU usage rounded to two decimals, or ``None`` when ``top`` cannot be
        run or no idle value can be found in its output.
    """
    if top_output is None:
        try:
            result = subprocess.run(
                ['top', '-bn1'],
                capture_output=True, text=True, timeout=10,
                # Force the C locale so numbers are printed with '.' as the
                # decimal separator regardless of the cron environment.
                env={**os.environ, 'LC_ALL': 'C'},
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # top missing, timed out, or produced undecodable output.
            return None
        top_output = result.stdout

    for line in top_output.splitlines():
        if 'Cpu' not in line:
            continue
        # Expected shape: "%Cpu(s):  2.3 us,  0.5 sy, ...  96.2 id, ...".
        # The number is pulled out with a regex instead of whitespace
        # splitting because top prints "ni,100.0 id" (no space after the
        # comma) on a fully idle machine, which made the old token-based
        # parser fail. A decimal comma and the older "96.2%id" spelling are
        # accepted as well.
        match = re.search(r'(\d+(?:[.,]\d+)?)\s*%?\s*id\b', line)
        if match:
            idle = float(match.group(1).replace(',', '.'))
            return round(100.0 - idle, 2)
    return None


def get_ram_percent(free_output=None):
    """Return RAM usage in percent, computed from the ``Mem:`` row of ``free -m``.

    Usage is ``(1 - available / total) * 100``.

    Args:
        free_output: Optional pre-captured text of ``free -m``. When ``None``
            (the default) the command is executed.

    Returns:
        RAM usage rounded to two decimals, or ``None`` when ``free`` cannot
        be run, the ``Mem:`` row is missing or too short, or the reported
        total is not positive.
    """
    if free_output is None:
        try:
            result = subprocess.run(
                ['free', '-m'],
                capture_output=True, text=True, timeout=5,
                # Force the C locale: the "Mem:" row label is translatable,
                # so under a localized environment the row would never match.
                env={**os.environ, 'LC_ALL': 'C'},
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # free missing, timed out, or produced undecodable output.
            return None
        free_output = result.stdout

    for line in free_output.splitlines():
        if not line.startswith('Mem:'):
            continue
        parts = line.split()
        try:
            total = float(parts[1])
            # Seventh token = "available" column in the
            # total/used/free/shared/buff-cache/available layout.
            # NOTE(review): older `free` builds without that column would put
            # a different value here — confirm the target procps version.
            available = float(parts[6])
        except (IndexError, ValueError):
            return None
        if total <= 0:
            # Avoid dividing by zero on nonsensical output.
            return None
        return round((1 - available / total) * 100, 2)
    return None


def get_disk_percent():
    """Return the usage percentage that ``df -h /`` reports for ``/``.

    Only the first data row (the line right after the header) is inspected;
    the first whitespace-separated field ending in ``%`` is taken as the
    usage figure.

    Returns:
        The percentage as a float, or ``None`` when ``df`` fails, prints fewer
        than two lines, or the row carries no usable ``%`` field.
    """
    try:
        completed = subprocess.run(
            ['df', '-h', '/'],
            capture_output=True, text=True, timeout=5
        )
        rows = completed.stdout.strip().split('\n')
        if len(rows) < 2:
            return None
        # Row layout: Filesystem  Size  Used  Avail  Use%  Mounted on
        percent_field = next(
            (field for field in rows[1].split() if field.endswith('%')),
            None,
        )
        if percent_field is None:
            return None
        return float(percent_field.rstrip('%'))
    except Exception:
        # Best-effort metric: any failure is reported as "unknown".
        return None


def get_gunicorn_workers():
    """Return the number of processes counted by ``pgrep -c gunicorn``.

    Returns:
        The count printed by ``pgrep`` when it exits with status 0; ``0`` when
        it exits with any other status or when running/parsing it fails.
    """
    try:
        proc = subprocess.run(
            ['pgrep', '-c', 'gunicorn'],
            capture_output=True, text=True, timeout=5
        )
        if proc.returncode != 0:
            return 0
        return int(proc.stdout.strip())
    except Exception:
        # Best-effort metric: treat any failure as "no processes found".
        return 0


def cleanup_old_logs(db):
    """Delete health-log rows whose ``checked_at`` is older than RETENTION_DAYS.

    The deletion is committed, and a summary line printed, only when at least
    one row was removed.

    Args:
        db: Open database session used for the delete and the commit.
    """
    threshold = datetime.now() - timedelta(days=RETENTION_DAYS)
    stale_rows = db.query(InternalHealthLog).filter(
        InternalHealthLog.checked_at < threshold
    )
    deleted = stale_rows.delete()
    if not deleted:
        return
    db.commit()
    print(f"Usunięto {deleted} starych logów health (>{RETENTION_DAYS} dni)")


def main():
    """Take one health snapshot, store it, and occasionally prune old rows.

    The probes run in a fixed order (app, database, CPU, RAM, disk, gunicorn
    process count) and their results are saved as one ``InternalHealthLog``
    row. When the current time falls within 03:00-03:04 the retention cleanup
    runs as well. Any error is printed to stderr and the session is rolled
    back; the session is closed in every case.
    """
    session = SessionLocal()
    try:
        # A dict literal evaluates its values top to bottom, so the probes
        # run in the order listed here.
        snapshot = {
            'checked_at': datetime.now(),
            'app_ok': check_app_health(),
            'db_ok': check_db_health(),
            'cpu_percent': get_cpu_percent(),
            'ram_percent': get_ram_percent(),
            'disk_percent': get_disk_percent(),
            'gunicorn_workers': get_gunicorn_workers(),
        }
        log = InternalHealthLog(**snapshot)
        session.add(log)
        session.commit()

        # Retention cleanup once a day: only during the first five minutes
        # after 03:00.
        current = datetime.now()
        in_cleanup_window = current.hour == 3 and current.minute < 5
        if in_cleanup_window:
            cleanup_old_logs(session)

        print(f"[{log.checked_at}] app={log.app_ok} db={log.db_ok} "
              f"cpu={log.cpu_percent}% ram={log.ram_percent}% disk={log.disk_percent}% "
              f"workers={log.gunicorn_workers}")

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        session.rollback()
    finally:
        session.close()


if __name__ == '__main__':
    main()