From 9540f7f2e0e96dc5131337a999fab895da979bcb Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Sun, 15 Mar 2026 07:53:05 +0100 Subject: [PATCH] feat: add uptime monitoring dashboard with UptimeRobot integration External monitoring via UptimeRobot (free tier) with internal health logger to differentiate ISP outages from server issues. Includes: - 4 new DB models (UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog) - Migration 082 with tables, indexes, and permissions - Internal health logger script (cron */5 min) - UptimeRobot sync script (cron hourly) with automatic cause correlation - Admin dashboard /admin/uptime with uptime %, response time charts, incident log with editable notes/causes, pattern analysis, monthly report - SLA comparison table (99.9%/99.5%/99%) Co-Authored-By: Claude Opus 4.6 (1M context) --- blueprints/admin/routes_status.py | 227 ++++- database.py | 72 ++ database/migrations/082_uptime_monitoring.sql | 67 ++ .../2026-03-15-uptime-monitoring-design.md | 149 ++++ scripts/internal_health_logger.py | 163 ++++ scripts/uptimerobot_sync.py | 281 +++++++ templates/admin/uptime_dashboard.html | 791 ++++++++++++++++++ templates/base.html | 6 + 8 files changed, 1755 insertions(+), 1 deletion(-) create mode 100644 database/migrations/082_uptime_monitoring.sql create mode 100644 docs/superpowers/specs/2026-03-15-uptime-monitoring-design.md create mode 100644 scripts/internal_health_logger.py create mode 100644 scripts/uptimerobot_sync.py create mode 100644 templates/admin/uptime_dashboard.html diff --git a/blueprints/admin/routes_status.py b/blueprints/admin/routes_status.py index f3f05da..ff68a9f 100644 --- a/blueprints/admin/routes_status.py +++ b/blueprints/admin/routes_status.py @@ -18,7 +18,8 @@ from sqlalchemy import func, text from . 
import bp from database import ( SessionLocal, Company, User, AuditLog, SecurityAlert, - CompanySocialMedia, CompanyWebsiteAnalysis, SystemRole + CompanySocialMedia, CompanyWebsiteAnalysis, SystemRole, + UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog ) from utils.decorators import role_required @@ -786,3 +787,227 @@ def api_admin_health(): 'health_percent': round(100 * ok_count / len(results), 1) } }) + + +# ============================================================ +# UPTIME MONITORING +# ============================================================ + +def _get_uptime_data(db, days=30): + """Pobierz dane uptime dla dashboardu""" + now = datetime.now() + data = {} + + # Aktywne monitory + monitors = db.query(UptimeMonitor).filter_by(is_active=True).all() + if not monitors: + return {'monitors': [], 'has_data': False} + + monitor = monitors[0] # Główny monitor + data['monitor'] = { + 'name': monitor.name, + 'url': monitor.url, + 'id': monitor.id + } + data['has_data'] = True + + # Ostatni check + last_check = db.query(UptimeCheck).filter_by( + monitor_id=monitor.id + ).order_by(UptimeCheck.checked_at.desc()).first() + + if last_check: + data['current_status'] = last_check.status + data['last_checked'] = last_check.checked_at.strftime('%Y-%m-%d %H:%M') + data['last_response_time'] = last_check.response_time_ms + else: + data['current_status'] = 'unknown' + data['last_checked'] = None + data['last_response_time'] = None + + # Uptime % dla różnych okresów + data['uptime'] = {} + for period_name, period_days in [('24h', 1), ('7d', 7), ('30d', 30), ('90d', 90)]: + cutoff = now - timedelta(days=period_days) + total = db.query(UptimeCheck).filter( + UptimeCheck.monitor_id == monitor.id, + UptimeCheck.checked_at >= cutoff + ).count() + up = db.query(UptimeCheck).filter( + UptimeCheck.monitor_id == monitor.id, + UptimeCheck.checked_at >= cutoff, + UptimeCheck.status == 'up' + ).count() + pct = round(100 * up / total, 3) if total > 0 else None + 
data['uptime'][period_name] = { + 'percent': pct, + 'total_checks': total, + 'up_checks': up, + 'down_checks': total - up if total else 0 + } + + # Response time (ostatnie N dni) + cutoff = now - timedelta(days=days) + response_times = db.query( + UptimeCheck.checked_at, + UptimeCheck.response_time_ms + ).filter( + UptimeCheck.monitor_id == monitor.id, + UptimeCheck.checked_at >= cutoff, + UptimeCheck.response_time_ms.isnot(None) + ).order_by(UptimeCheck.checked_at).all() + + data['response_times'] = [ + {'time': rt.checked_at.strftime('%Y-%m-%d %H:%M'), 'ms': rt.response_time_ms} + for rt in response_times + ] + + # Średni response time + if response_times: + avg_rt = sum(rt.response_time_ms for rt in response_times) / len(response_times) + data['avg_response_time'] = round(avg_rt) + else: + data['avg_response_time'] = None + + # Incydenty + incidents = db.query(UptimeIncident).filter( + UptimeIncident.monitor_id == monitor.id + ).order_by(UptimeIncident.started_at.desc()).limit(50).all() + + data['incidents'] = [{ + 'id': inc.id, + 'started_at': inc.started_at.strftime('%Y-%m-%d %H:%M'), + 'ended_at': inc.ended_at.strftime('%Y-%m-%d %H:%M') if inc.ended_at else None, + 'duration_seconds': inc.duration_seconds, + 'duration_human': _format_duration(inc.duration_seconds) if inc.duration_seconds else 'trwa...', + 'cause': inc.cause, + 'cause_label': {'isp': 'ISP (Chopin)', 'server': 'Serwer', 'infra': 'Infrastruktura', 'unknown': 'Nieznana'}.get(inc.cause, inc.cause), + 'notes': inc.notes or '' + } for inc in incidents] + + # Analiza wzorców — awarie wg godziny i dnia tygodnia + all_incidents = db.query(UptimeIncident).filter( + UptimeIncident.monitor_id == monitor.id + ).all() + + hour_counts = [0] * 24 + dow_counts = [0] * 7 # 0=pon, 6=nie + cause_counts = {'isp': 0, 'server': 0, 'infra': 0, 'unknown': 0} + + for inc in all_incidents: + hour_counts[inc.started_at.hour] += 1 + dow_counts[inc.started_at.weekday()] += 1 + cause_counts[inc.cause] = 
cause_counts.get(inc.cause, 0) + 1 + + data['patterns'] = { + 'by_hour': hour_counts, + 'by_dow': dow_counts, + 'by_cause': cause_counts + } + + # Raport miesięczny (bieżący miesiąc) + month_start = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + month_incidents = [i for i in all_incidents if i.started_at >= month_start] + month_downtime = sum(i.duration_seconds or 0 for i in month_incidents) + days_in_month = (now - month_start).days or 1 + month_total_seconds = days_in_month * 86400 + month_uptime_pct = round(100 * (1 - month_downtime / month_total_seconds), 3) if month_total_seconds > 0 else 100 + + # Poprzedni miesiąc + prev_month_end = month_start - timedelta(seconds=1) + prev_month_start = prev_month_end.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + prev_month_incidents = [i for i in all_incidents if prev_month_start <= i.started_at < month_start] + prev_month_downtime = sum(i.duration_seconds or 0 for i in prev_month_incidents) + + data['monthly_report'] = { + 'month': now.strftime('%B %Y'), + 'uptime_pct': month_uptime_pct, + 'total_downtime_seconds': month_downtime, + 'total_downtime_human': _format_duration(month_downtime), + 'incidents_count': len(month_incidents), + 'longest_incident': _format_duration(max((i.duration_seconds or 0 for i in month_incidents), default=0)), + 'prev_month': prev_month_end.strftime('%B %Y'), + 'prev_downtime_seconds': prev_month_downtime, + 'prev_downtime_human': _format_duration(prev_month_downtime), + 'prev_incidents_count': len(prev_month_incidents), + 'trend': 'better' if month_downtime < prev_month_downtime else ('worse' if month_downtime > prev_month_downtime else 'same') + } + + # SLA kontekst + data['sla_context'] = { + '99.9': {'max_downtime_month': '43 min', 'max_downtime_year': '8h 46min'}, + '99.5': {'max_downtime_month': '3h 36min', 'max_downtime_year': '1d 19h'}, + '99.0': {'max_downtime_month': '7h 18min', 'max_downtime_year': '3d 15h'}, + } + + return data + + +def 
_format_duration(seconds): + """Formatuj sekundy na czytelny tekst""" + if not seconds or seconds <= 0: + return '0s' + if seconds < 60: + return f'{seconds}s' + if seconds < 3600: + m = seconds // 60 + s = seconds % 60 + return f'{m}min {s}s' if s else f'{m}min' + h = seconds // 3600 + m = (seconds % 3600) // 60 + return f'{h}h {m}min' if m else f'{h}h' + + +@bp.route('/uptime') +@login_required +@role_required(SystemRole.OFFICE_MANAGER) +def admin_uptime(): + """Dashboard monitoringu uptime""" + db = SessionLocal() + try: + data = _get_uptime_data(db, days=30) + return render_template('admin/uptime_dashboard.html', data=data) + finally: + db.close() + + +@bp.route('/api/uptime') +@login_required +@role_required(SystemRole.OFFICE_MANAGER) +def api_admin_uptime(): + """API endpoint dla auto-refresh dashboardu uptime""" + db = SessionLocal() + try: + days = request.args.get('days', 30, type=int) + data = _get_uptime_data(db, days=min(days, 90)) + data['timestamp'] = datetime.now().isoformat() + data['success'] = True + return jsonify(data) + finally: + db.close() + + +@bp.route('/api/uptime/incident//notes', methods=['POST']) +@login_required +@role_required(SystemRole.OFFICE_MANAGER) +def api_update_incident_notes(incident_id): + """Aktualizuj notatki incydentu""" + db = SessionLocal() + try: + incident = db.query(UptimeIncident).get(incident_id) + if not incident: + return jsonify({'success': False, 'error': 'Incident not found'}), 404 + + data = request.get_json() + if data and 'notes' in data: + incident.notes = data['notes'] + if data and 'cause' in data and data['cause'] in ('isp', 'server', 'infra', 'unknown'): + incident.cause = data['cause'] + + db.commit() + return jsonify({'success': True}) + except Exception as e: + db.rollback() + return jsonify({'success': False, 'error': str(e)}), 500 + finally: + db.close() diff --git a/database.py b/database.py index 923f1c3..83f9a0e 100644 --- a/database.py +++ b/database.py @@ -5589,6 +5589,78 @@ class 
PortalSEOAudit(Base): return f'' +# ============================================================ +# UPTIME MONITORING +# ============================================================ + +class UptimeMonitor(Base): + """Konfiguracja monitorów UptimeRobot""" + __tablename__ = 'uptime_monitors' + + id = Column(Integer, primary_key=True) + uptimerobot_id = Column(Integer, unique=True, nullable=False) + name = Column(String(200), nullable=False) + url = Column(String(500), nullable=False) + check_interval_sec = Column(Integer, default=300) + is_active = Column(Boolean, default=True) + created_at = Column(DateTime, default=datetime.now) + + checks = relationship('UptimeCheck', backref='monitor', lazy='dynamic') + incidents = relationship('UptimeIncident', backref='monitor', lazy='dynamic') + + def __repr__(self): + return f'' + + +class UptimeCheck(Base): + """Wyniki sprawdzeń z UptimeRobot (synchronizowane co godzinę)""" + __tablename__ = 'uptime_checks' + + id = Column(Integer, primary_key=True) + monitor_id = Column(Integer, ForeignKey('uptime_monitors.id'), nullable=False, index=True) + checked_at = Column(DateTime, nullable=False, index=True) + status = Column(String(20), nullable=False) # 'up', 'down', 'paused' + response_time_ms = Column(Integer) + status_code = Column(Integer) + + def __repr__(self): + return f'' + + +class UptimeIncident(Base): + """Okresy niedostępności z automatyczną diagnozą przyczyny""" + __tablename__ = 'uptime_incidents' + + id = Column(Integer, primary_key=True) + monitor_id = Column(Integer, ForeignKey('uptime_monitors.id'), nullable=False, index=True) + started_at = Column(DateTime, nullable=False, index=True) + ended_at = Column(DateTime) + duration_seconds = Column(Integer) + cause = Column(String(20), default='unknown') # 'isp', 'server', 'infra', 'unknown' + notes = Column(Text) + auto_resolved = Column(Boolean, default=False) + + def __repr__(self): + return f'' + + +class InternalHealthLog(Base): + """Wewnętrzny stan serwera (cron 
co 5 min)""" + __tablename__ = 'internal_health_logs' + + id = Column(Integer, primary_key=True) + checked_at = Column(DateTime, nullable=False, default=datetime.now, index=True) + app_ok = Column(Boolean, nullable=False) + db_ok = Column(Boolean, nullable=False) + cpu_percent = Column(Numeric(5, 2)) + ram_percent = Column(Numeric(5, 2)) + disk_percent = Column(Numeric(5, 2)) + gunicorn_workers = Column(Integer) + + def __repr__(self): + return f'' + + # ============================================================ # DATABASE INITIALIZATION # ============================================================ diff --git a/database/migrations/082_uptime_monitoring.sql b/database/migrations/082_uptime_monitoring.sql new file mode 100644 index 0000000..ee4c5af --- /dev/null +++ b/database/migrations/082_uptime_monitoring.sql @@ -0,0 +1,67 @@ +-- Migration 082: Uptime Monitoring +-- Tabele do monitorowania dostępności portalu z zewnątrz (UptimeRobot) +-- oraz wewnętrznego stanu serwera (health logger) + +-- Konfiguracja monitorów UptimeRobot +CREATE TABLE IF NOT EXISTS uptime_monitors ( + id SERIAL PRIMARY KEY, + uptimerobot_id INTEGER UNIQUE NOT NULL, + name VARCHAR(200) NOT NULL, + url VARCHAR(500) NOT NULL, + check_interval_sec INTEGER DEFAULT 300, + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMP DEFAULT NOW() +); + +-- Wyniki sprawdzeń z UptimeRobot +CREATE TABLE IF NOT EXISTS uptime_checks ( + id SERIAL PRIMARY KEY, + monitor_id INTEGER NOT NULL REFERENCES uptime_monitors(id) ON DELETE CASCADE, + checked_at TIMESTAMP NOT NULL, + status VARCHAR(20) NOT NULL, + response_time_ms INTEGER, + status_code INTEGER +); + +CREATE INDEX IF NOT EXISTS idx_uptime_checks_monitor_id ON uptime_checks(monitor_id); +CREATE INDEX IF NOT EXISTS idx_uptime_checks_checked_at ON uptime_checks(checked_at); + +-- Incydenty (okresy niedostępności) +CREATE TABLE IF NOT EXISTS uptime_incidents ( + id SERIAL PRIMARY KEY, + monitor_id INTEGER NOT NULL REFERENCES uptime_monitors(id) ON DELETE 
CASCADE, + started_at TIMESTAMP NOT NULL, + ended_at TIMESTAMP, + duration_seconds INTEGER, + cause VARCHAR(20) DEFAULT 'unknown', + notes TEXT, + auto_resolved BOOLEAN DEFAULT FALSE +); + +CREATE INDEX IF NOT EXISTS idx_uptime_incidents_monitor_id ON uptime_incidents(monitor_id); +CREATE INDEX IF NOT EXISTS idx_uptime_incidents_started_at ON uptime_incidents(started_at); + +-- Wewnętrzny health log (stan serwera co 5 min) +CREATE TABLE IF NOT EXISTS internal_health_logs ( + id SERIAL PRIMARY KEY, + checked_at TIMESTAMP NOT NULL DEFAULT NOW(), + app_ok BOOLEAN NOT NULL, + db_ok BOOLEAN NOT NULL, + cpu_percent NUMERIC(5,2), + ram_percent NUMERIC(5,2), + disk_percent NUMERIC(5,2), + gunicorn_workers INTEGER +); + +CREATE INDEX IF NOT EXISTS idx_internal_health_logs_checked_at ON internal_health_logs(checked_at); + +-- Uprawnienia dla app usera +GRANT ALL ON TABLE uptime_monitors TO nordabiz_app; +GRANT ALL ON TABLE uptime_checks TO nordabiz_app; +GRANT ALL ON TABLE uptime_incidents TO nordabiz_app; +GRANT ALL ON TABLE internal_health_logs TO nordabiz_app; + +GRANT USAGE, SELECT ON SEQUENCE uptime_monitors_id_seq TO nordabiz_app; +GRANT USAGE, SELECT ON SEQUENCE uptime_checks_id_seq TO nordabiz_app; +GRANT USAGE, SELECT ON SEQUENCE uptime_incidents_id_seq TO nordabiz_app; +GRANT USAGE, SELECT ON SEQUENCE internal_health_logs_id_seq TO nordabiz_app; diff --git a/docs/superpowers/specs/2026-03-15-uptime-monitoring-design.md b/docs/superpowers/specs/2026-03-15-uptime-monitoring-design.md new file mode 100644 index 0000000..f56ec15 --- /dev/null +++ b/docs/superpowers/specs/2026-03-15-uptime-monitoring-design.md @@ -0,0 +1,149 @@ +# Uptime Monitoring - Design Spec + +**Data:** 2026-03-15 +**Status:** Zatwierdzony + +## Problem + +Portal nordabiznes.pl jest hostowany on-premise w INPI, za ISP Telewizja Kablowa Chopin. W ciągu ostatnich 2 miesięcy wystąpiły minimum 3 awarie internetu (10 marca, 14 marca + wcześniejszy incydent), powodujące niedostępność portalu z zewnątrz. 
Brak monitoringu uniemożliwia: +- Udokumentowanie skali problemu +- Odróżnienie awarii ISP od awarii serwera +- Podjęcie decyzji o ewentualnej migracji hostingu + +## Rozwiązanie + +Podejście B: UptimeRobot (zewnętrzny monitoring) + wewnętrzny health logger z korelacją awarii. + +## Architektura + +``` +UptimeRobot.com (free) NORDABIZ-01 (10.22.68.249) + │ sprawdza co 5 min │ wewnętrzny logger co 5 min + │ HTTPS → nordabiznes.pl │ app/db/cpu/ram/disk → PostgreSQL + │ │ + └── REST API ──────────────────→ │ sync co godzinę + │ korelacja: ISP vs serwer vs infra + ▼ + /admin/uptime (dashboard) +``` + +### Korelacja awarii + +| UptimeRobot | Wewnętrzny log | Diagnoza | +|---|---|---| +| DOWN | serwer OK | Awaria ISP (Chopin) | +| DOWN | serwer DOWN | Awaria serwera/VM | +| DOWN | brak logów | Awaria infrastruktury INPI | +| UP | serwer OK | Wszystko działa | + +## Schemat bazy danych + +### uptime_monitors +Konfiguracja monitorów UptimeRobot. + +| Kolumna | Typ | Opis | +|---------|-----|------| +| id | SERIAL PK | | +| uptimerobot_id | INTEGER UNIQUE | ID monitora w UptimeRobot | +| name | VARCHAR(200) | Nazwa monitora | +| url | VARCHAR(500) | Monitorowany URL | +| check_interval_sec | INTEGER | Interwał sprawdzania (300 = 5 min) | +| is_active | BOOLEAN DEFAULT TRUE | | +| created_at | TIMESTAMP | | + +### uptime_checks +Wyniki sprawdzeń z UptimeRobot (synchronizowane co godzinę). + +| Kolumna | Typ | Opis | +|---------|-----|------| +| id | SERIAL PK | | +| monitor_id | INTEGER FK | → uptime_monitors.id | +| checked_at | TIMESTAMP | Czas sprawdzenia | +| status | VARCHAR(20) | 'up' / 'down' / 'paused' | +| response_time_ms | INTEGER | Czas odpowiedzi w ms | +| status_code | INTEGER | HTTP status code | + +### uptime_incidents +Okresy niedostępności z automatyczną diagnozą przyczyny. 
+ +| Kolumna | Typ | Opis | +|---------|-----|------| +| id | SERIAL PK | | +| monitor_id | INTEGER FK | → uptime_monitors.id | +| started_at | TIMESTAMP | Początek awarii | +| ended_at | TIMESTAMP NULL | Koniec (NULL = trwa) | +| duration_seconds | INTEGER | Czas trwania | +| cause | VARCHAR(20) | 'isp' / 'server' / 'infra' / 'unknown' | +| notes | TEXT | Notatki admina | +| auto_resolved | BOOLEAN DEFAULT FALSE | Czy zakończony automatycznie | + +### internal_health_logs +Wewnętrzny stan serwera (cron co 5 min, lokalnie). + +| Kolumna | Typ | Opis | +|---------|-----|------| +| id | SERIAL PK | | +| checked_at | TIMESTAMP | | +| app_ok | BOOLEAN | /health odpowiada OK | +| db_ok | BOOLEAN | PostgreSQL dostępny | +| cpu_percent | REAL | Użycie CPU % | +| ram_percent | REAL | Użycie RAM % | +| disk_percent | REAL | Użycie dysku % | +| gunicorn_workers | INTEGER | Liczba aktywnych workerów | + +## Skrypty + +### scripts/internal_health_logger.py +- Cron: `*/5 * * * *` +- Sprawdza: localhost:5000/health, połączenie DB, psutil (CPU/RAM/disk), pgrep gunicorn +- Zapisuje do `internal_health_logs` +- Retencja: automatyczne czyszczenie logów starszych niż 90 dni + +### scripts/uptimerobot_sync.py +- Cron: `0 * * * *` (co godzinę) +- Pobiera z UptimeRobot API: response times, logi (up/down events) +- Zapisuje do `uptime_checks` +- Tworzy/aktualizuje `uptime_incidents` na podstawie logów down/up +- Koreluje z `internal_health_logs` — ustawia `cause` automatycznie +- Env: `UPTIMEROBOT_API_KEY` w .env + +## Dashboard /admin/uptime + +### Sekcje + +1. **Aktualny status** — badge UP/DOWN, czas ostatniego sprawdzenia, response time +2. **Uptime podsumowanie** — karty 24h/7d/30d/90d z procentem i oceną SLA + - ≥99.9% zielony, 99.5-99.9% żółty, <99.5% czerwony + - Kontekst: "99.5% = max 3.6h przestoju/miesiąc" +3. **Wykres response time** — Chart.js, przełącznik 24h/7d/30d +4. 
**Lista incydentów** — tabela z: data, czas trwania, przyczyna (ISP/Serwer/Infra), notatki (edytowalne) +5. **Analiza wzorców** — wykres słupkowy: awarie wg godziny/dnia tygodnia +6. **Raport miesięczny** — SLA %, łączny downtime, liczba incydentów, najdłuższa awaria, trend + +### Dostęp +- Route: `/admin/uptime` +- Wymagana rola: `SystemRole.OFFICE_MANAGER` +- Auto-refresh: co 5 min (JSON API endpoint `/admin/api/uptime`) +- Link w nawigacji: sekcja System → "Monitoring uptime" + +## UptimeRobot Setup (manual) + +1. Konto na uptimerobot.com (free tier) +2. Monitor: HTTP(s), URL `https://nordabiznes.pl/health`, interwał 5 min +3. Alert contact: email +4. API key (Main API Key, read-only) → `.env` jako `UPTIMEROBOT_API_KEY` + +## Retencja danych + +| Tabela | Retencja | +|--------|----------| +| uptime_checks | 90 dni (sync script czyści starsze) | +| uptime_incidents | Bez limitu (kluczowe dla raportów) | +| internal_health_logs | 90 dni (health logger czyści starsze) | + +## Technologie + +- Backend: Flask route w `routes_status.py` +- Frontend: Jinja2 template, Chart.js (już używany w projekcie) +- Scheduled: systemowy cron (jak istniejące skrypty) +- External: UptimeRobot free API diff --git a/scripts/internal_health_logger.py b/scripts/internal_health_logger.py new file mode 100644 index 0000000..d43e9fb --- /dev/null +++ b/scripts/internal_health_logger.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +Internal Health Logger +====================== +Cron job (*/5 * * * *) - zapisuje stan serwera co 5 minut. +Pozwala odróżnić awarię ISP od awarii serwera. 
+ +Użycie: + */5 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/internal_health_logger.py +""" + +import os +import sys +import subprocess +import urllib.request +import urllib.error +from datetime import datetime, timedelta + +# Setup path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from database import SessionLocal, InternalHealthLog + +HEALTH_URL = 'http://localhost:5000/health' +RETENTION_DAYS = 90 + + +def check_app_health(): + """Sprawdź czy aplikacja Flask odpowiada na /health""" + try: + req = urllib.request.Request(HEALTH_URL, method='GET') + with urllib.request.urlopen(req, timeout=5) as resp: + return resp.status == 200 + except Exception: + return False + + +def check_db_health(): + """Sprawdź czy PostgreSQL jest dostępny""" + try: + db = SessionLocal() + from sqlalchemy import text + db.execute(text('SELECT 1')) + db.close() + return True + except Exception: + return False + + +def get_cpu_percent(): + """Pobierz użycie CPU z /proc/stat lub top""" + try: + result = subprocess.run( + ['top', '-bn1'], + capture_output=True, text=True, timeout=10 + ) + for line in result.stdout.split('\n'): + if 'Cpu' in line or '%Cpu' in line: + # Format: %Cpu(s): 2.3 us, 0.5 sy, ... 
96.2 id + parts = line.split() + for i, part in enumerate(parts): + if part == 'id,' or part == 'id': + idle = float(parts[i - 1]) + return round(100.0 - idle, 2) + return None + except Exception: + return None + + +def get_ram_percent(): + """Pobierz użycie RAM""" + try: + result = subprocess.run( + ['free', '-m'], + capture_output=True, text=True, timeout=5 + ) + for line in result.stdout.split('\n'): + if line.startswith('Mem:'): + parts = line.split() + total = float(parts[1]) + available = float(parts[6]) # available column + used_pct = round((1 - available / total) * 100, 2) + return used_pct + return None + except Exception: + return None + + +def get_disk_percent(): + """Pobierz użycie dysku /""" + try: + result = subprocess.run( + ['df', '-h', '/'], + capture_output=True, text=True, timeout=5 + ) + lines = result.stdout.strip().split('\n') + if len(lines) >= 2: + parts = lines[1].split() + # Format: Filesystem Size Used Avail Use% Mounted + for part in parts: + if part.endswith('%'): + return float(part.rstrip('%')) + return None + except Exception: + return None + + +def get_gunicorn_workers(): + """Policz aktywne procesy gunicorn""" + try: + result = subprocess.run( + ['pgrep', '-c', 'gunicorn'], + capture_output=True, text=True, timeout=5 + ) + return int(result.stdout.strip()) if result.returncode == 0 else 0 + except Exception: + return 0 + + +def cleanup_old_logs(db): + """Usuń logi starsze niż RETENTION_DAYS""" + cutoff = datetime.now() - timedelta(days=RETENTION_DAYS) + deleted = db.query(InternalHealthLog).filter( + InternalHealthLog.checked_at < cutoff + ).delete() + if deleted: + db.commit() + print(f"Usunięto {deleted} starych logów health (>{RETENTION_DAYS} dni)") + + +def main(): + db = SessionLocal() + try: + log = InternalHealthLog( + checked_at=datetime.now(), + app_ok=check_app_health(), + db_ok=check_db_health(), + cpu_percent=get_cpu_percent(), + ram_percent=get_ram_percent(), + disk_percent=get_disk_percent(), + 
gunicorn_workers=get_gunicorn_workers() + ) + db.add(log) + db.commit() + + # Cleanup co jakiś czas (sprawdź raz dziennie, przy pełnej godzinie 3:00) + now = datetime.now() + if now.hour == 3 and now.minute < 5: + cleanup_old_logs(db) + + print(f"[{log.checked_at}] app={log.app_ok} db={log.db_ok} " + f"cpu={log.cpu_percent}% ram={log.ram_percent}% disk={log.disk_percent}% " + f"workers={log.gunicorn_workers}") + + except Exception as e: + print(f"ERROR: {e}", file=sys.stderr) + db.rollback() + finally: + db.close() + + +if __name__ == '__main__': + main() diff --git a/scripts/uptimerobot_sync.py b/scripts/uptimerobot_sync.py new file mode 100644 index 0000000..67c5c43 --- /dev/null +++ b/scripts/uptimerobot_sync.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +UptimeRobot Sync +================ +Cron job (0 * * * *) - synchronizuje dane z UptimeRobot API co godzinę. +Pobiera response times, logi up/down, koreluje z internal_health_logs. + +Użycie: + 0 * * * * cd /var/www/nordabiznes && DATABASE_URL=$(grep DATABASE_URL .env | cut -d'=' -f2) UPTIMEROBOT_API_KEY=$(grep UPTIMEROBOT_API_KEY .env | cut -d'=' -f2) /var/www/nordabiznes/venv/bin/python3 scripts/uptimerobot_sync.py + +Wymagane env: + UPTIMEROBOT_API_KEY - API key z UptimeRobot (Main API Key) +""" + +import os +import sys +import json +import urllib.request +import urllib.error +from datetime import datetime, timedelta + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from database import ( + SessionLocal, UptimeMonitor, UptimeCheck, UptimeIncident, InternalHealthLog +) + +API_KEY = os.environ.get('UPTIMEROBOT_API_KEY', '') +API_BASE = 'https://api.uptimerobot.com/v2' +RETENTION_DAYS = 90 + +# UptimeRobot status codes +UR_STATUS = { + 0: 'paused', + 1: 'not_checked', + 2: 'up', + 8: 'seems_down', + 9: 'down', +} + + +def api_request(endpoint, extra_params=None): + """Wyślij zapytanie do UptimeRobot API v2""" + params = { + 'api_key': API_KEY, + 'format': 'json', + } + if 
extra_params: + params.update(extra_params) + + data = json.dumps(params).encode('utf-8') + req = urllib.request.Request( + f'{API_BASE}/{endpoint}', + data=data, + headers={'Content-Type': 'application/json'}, + method='POST' + ) + + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.loads(resp.read().decode('utf-8')) + except Exception as e: + print(f"API error ({endpoint}): {e}", file=sys.stderr) + return None + + +def sync_monitors(db): + """Synchronizuj listę monitorów z UptimeRobot""" + result = api_request('getMonitors', { + 'response_times': 1, + 'response_times_limit': 1, + 'logs': 1, + 'logs_limit': 50, + 'all_time_uptime_ratio': 1, + 'custom_uptime_ratios': '1-7-30-90', + }) + + if not result or result.get('stat') != 'ok': + print(f"Błąd API getMonitors: {result}", file=sys.stderr) + return [] + + monitors = result.get('monitors', []) + synced = [] + + for m in monitors: + ur_id = m['id'] + + # Upsert monitor + monitor = db.query(UptimeMonitor).filter_by(uptimerobot_id=ur_id).first() + if not monitor: + monitor = UptimeMonitor( + uptimerobot_id=ur_id, + name=m.get('friendly_name', ''), + url=m.get('url', ''), + check_interval_sec=m.get('interval', 300), + created_at=datetime.now() + ) + db.add(monitor) + db.flush() + print(f"Nowy monitor: {monitor.name} ({monitor.url})") + else: + monitor.name = m.get('friendly_name', monitor.name) + monitor.url = m.get('url', monitor.url) + + # Sync response times + sync_response_times(db, monitor, m.get('response_times', [])) + + # Sync logs (up/down events) → incydenty + sync_logs(db, monitor, m.get('logs', [])) + + synced.append(monitor) + + db.commit() + return synced + + +def sync_response_times(db, monitor, response_times): + """Zapisz response times jako uptime_checks""" + if not response_times: + return + + for rt in response_times: + ts = datetime.fromtimestamp(rt['datetime']) + + # Sprawdź czy już istnieje (unikaj duplikatów) + exists = db.query(UptimeCheck).filter_by( + 
monitor_id=monitor.id, + checked_at=ts + ).first() + + if not exists: + check = UptimeCheck( + monitor_id=monitor.id, + checked_at=ts, + status='up', # response time = was up + response_time_ms=rt.get('value', 0), + ) + db.add(check) + + +def sync_logs(db, monitor, logs): + """Przetwórz logi up/down z UptimeRobot na incydenty""" + if not logs: + return + + for log in logs: + log_type = log.get('type', 0) + ts = datetime.fromtimestamp(log['datetime']) + duration = log.get('duration', 0) + + if log_type == 1: # DOWN + # Sprawdź czy incydent już istnieje + existing = db.query(UptimeIncident).filter( + UptimeIncident.monitor_id == monitor.id, + UptimeIncident.started_at == ts + ).first() + + if existing: + # Aktualizuj jeśli się zakończył + if duration > 0 and not existing.ended_at: + existing.ended_at = ts + timedelta(seconds=duration) + existing.duration_seconds = duration + existing.auto_resolved = True + # Koreluj przyczynę + existing.cause = correlate_cause(db, ts, duration) + continue + + ended_at = ts + timedelta(seconds=duration) if duration > 0 else None + + incident = UptimeIncident( + monitor_id=monitor.id, + started_at=ts, + ended_at=ended_at, + duration_seconds=duration if duration > 0 else None, + cause=correlate_cause(db, ts, duration) if duration > 0 else 'unknown', + auto_resolved=duration > 0 + ) + db.add(incident) + + # Dodaj check DOWN + down_check = UptimeCheck( + monitor_id=monitor.id, + checked_at=ts, + status='down', + response_time_ms=None, + ) + db.add(down_check) + + elif log_type == 2: # UP (recovery) + # Dodaj check UP + up_check = UptimeCheck( + monitor_id=monitor.id, + checked_at=ts, + status='up', + response_time_ms=None, + ) + db.add(up_check) + + +def correlate_cause(db, incident_start, duration_seconds): + """ + Koreluj incydent z wewnętrznymi logami health. + Sprawdź czy serwer działał w czasie incydentu. 
+ + Logika: + - Jeśli internal_health_logs w oknie incydentu mają app_ok=True → ISP + - Jeśli mają app_ok=False → server + - Jeśli brak logów → infra (cały serwer padł) + """ + if not duration_seconds or duration_seconds <= 0: + return 'unknown' + + incident_end = incident_start + timedelta(seconds=duration_seconds) + + # Szukaj logów health z okna incydentu (z 5-min marginesem) + margin = timedelta(minutes=5) + health_logs = db.query(InternalHealthLog).filter( + InternalHealthLog.checked_at >= incident_start - margin, + InternalHealthLog.checked_at <= incident_end + margin + ).all() + + if not health_logs: + # Brak logów = cała infrastruktura padła (np. prąd, FortiGate) + return 'infra' + + # Sprawdź czy app działała + app_ok_count = sum(1 for h in health_logs if h.app_ok) + total = len(health_logs) + + if app_ok_count == total: + # Serwer działał normalnie → problem z internetem (ISP) + return 'isp' + elif app_ok_count == 0: + # App nie działała → problem z serwerem + return 'server' + else: + # Mieszane — częściowa awaria + return 'server' + + +def cleanup_old_checks(db): + """Usuń stare uptime_checks (>90 dni)""" + cutoff = datetime.now() - timedelta(days=RETENTION_DAYS) + deleted = db.query(UptimeCheck).filter( + UptimeCheck.checked_at < cutoff + ).delete() + if deleted: + print(f"Usunięto {deleted} starych uptime checks (>{RETENTION_DAYS} dni)") + + +def main(): + if not API_KEY: + print("BŁĄD: Brak UPTIMEROBOT_API_KEY w zmiennych środowiskowych", file=sys.stderr) + print("Ustaw klucz API w .env: UPTIMEROBOT_API_KEY=ur...", file=sys.stderr) + sys.exit(1) + + db = SessionLocal() + try: + print(f"[{datetime.now()}] Synchronizacja UptimeRobot...") + + monitors = sync_monitors(db) + print(f"Zsynchronizowano {len(monitors)} monitorów") + + # Cleanup raz dziennie (o 4:00) + now = datetime.now() + if now.hour == 4 and now.minute < 5: + cleanup_old_checks(db) + + db.commit() + print(f"[{datetime.now()}] Synchronizacja zakończona") + + except Exception as e: + 
print(f"ERROR: {e}", file=sys.stderr) + db.rollback() + sys.exit(1) + finally: + db.close() + + +if __name__ == '__main__': + main() diff --git a/templates/admin/uptime_dashboard.html b/templates/admin/uptime_dashboard.html new file mode 100644 index 0000000..c7f7368 --- /dev/null +++ b/templates/admin/uptime_dashboard.html @@ -0,0 +1,791 @@ +{% extends "base.html" %} + +{% block title %}Monitoring uptime - Admin{% endblock %} + +{% block extra_css %} + +{% endblock %} + +{% block content %} +
+
+

Monitoring uptime

+

Dostepnosc portalu nordabiznes.pl z perspektywy uzytkownikow zewnetrznych

+
+
+
Ostatnia aktualizacja
+
{{ now.strftime('%H:%M:%S') if now is defined else '--:--:--' }}
+
+
+ +{% if not data.has_data %} + +
+
+ + + +

Monitoring nie jest jeszcze skonfigurowany

+

Aby uruchomic monitoring, wykonaj ponizsza konfiguracje:

+
    +
  1. Zaloz konto na uptimerobot.com (darmowy plan)
  2. +
  3. Dodaj monitor: HTTPS, URL https://nordabiznes.pl/health, interwal 5 min
  4. +
  5. Skopiuj Main API Key z ustawien konta
  6. +
  7. Dodaj do .env: UPTIMEROBOT_API_KEY=twoj_klucz
  8. +
  9. Dodaj cron jobs na serwerze: +
    */5 * * * * cd /var/www/nordabiznes && ... (health logger) +
    0 * * * * cd /var/www/nordabiznes && ... (UptimeRobot sync) +
  10. +
+
+
+{% else %} + + +
+
+
+ {% if data.current_status == 'up' %} + Portal dziala poprawnie + {% elif data.current_status == 'down' %} + Portal niedostepny! + {% else %} + Status nieznany + {% endif %} +
+
+ {% if data.last_checked %} + Ostatnie sprawdzenie: {{ data.last_checked }} + {% if data.last_response_time %} | Czas odpowiedzi: {{ data.last_response_time }}ms{% endif %} + {% endif %} + | Monitor: {{ data.monitor.name }} ({{ data.monitor.url }}) +
+
+ + +
+ {% for period, label in [('24h', 'Ostatnie 24h'), ('7d', 'Ostatnie 7 dni'), ('30d', 'Ostatnie 30 dni'), ('90d', 'Ostatnie 90 dni')] %} +
+
{{ label }}
+ {% if data.uptime[period].percent is not none %} + {% set pct = data.uptime[period].percent %} +
+ {{ '%.2f' % pct }}% +
+
+ {{ data.uptime[period].down_checks }} awarii / {{ data.uptime[period].total_checks }} sprawdzen +
+ {% else %} +
--
+
Brak danych
+ {% endif %} +
+ {% endfor %} +
+ + +
+
+ + + + Czas odpowiedzi + {% if data.avg_response_time %} + + (sredni: {{ data.avg_response_time }}ms) + + {% endif %} +
+
+ + + +
+
+ +
+
+ + +
+
+ + + + Incydenty ({{ data.incidents|length }}) +
+ {% if data.incidents %} +
+ + + + + + + + + + + {% for inc in data.incidents %} + + + + + + + {% endfor %} + +
<th>Data</th><th>Czas trwania</th><th>Przyczyna</th><th>Notatki</th>
+ {{ inc.started_at }} + {% if inc.ended_at %}
do {{ inc.ended_at }}{% endif %} +
{{ inc.duration_human }} + + + +
+
+ {% else %} +

+ Brak zarejestrowanych incydentow +

+ {% endif %} +
+ + +
+
+ + + + Wzorce awarii +
+ {% if data.incidents %} +
+
+

Awarie wg godziny

+
+ +
+
+
+

Awarie wg dnia tygodnia

+
+ +
+
+
+ {% else %} +

Brak danych do analizy wzorcow

+ {% endif %} +
+ + +
+
+ + + + Raport miesieczny: {{ data.monthly_report.month }} +
+
+
+
+ {{ '%.3f' % data.monthly_report.uptime_pct }}% +
+
Uptime SLA
+
+
+
{{ data.monthly_report.total_downtime_human }}
+
Laczny przestoj
+
+
+
{{ data.monthly_report.incidents_count }}
+
Liczba incydentow
+
+
+
{{ data.monthly_report.longest_incident }}
+
Najdluzszy incydent
+
+
+ + +
+ Trend vs {{ data.monthly_report.prev_month }}: + + {% if data.monthly_report.trend == 'better' %}Lepiej + {% elif data.monthly_report.trend == 'worse' %}Gorzej + {% else %}Bez zmian{% endif %} + + (poprzednio: {{ data.monthly_report.prev_downtime_human }} przestoju, {{ data.monthly_report.prev_incidents_count }} incydentow) +
+ + + + + + + + + + + + + {% for level, limits in data.sla_context.items() %} + = level|float and (loop.last or data.monthly_report.uptime_pct < data.sla_context.keys()|list|sort|reverse|first|float if not loop.first else true) %}class="sla-current"{% endif %}> + + + + + + {% endfor %} + +
<th>Poziom SLA</th><th>Max przestoj / miesiac</th><th>Max przestoj / rok</th><th>Twoj status</th>
{{ level }}%{{ limits.max_downtime_month }}{{ limits.max_downtime_year }} + {% if data.monthly_report.uptime_pct >= level|float %} + Spelnia + {% else %} + Nie spelnia + {% endif %} +
+
+
+{% endif %}
+{% endblock %}
+
+{% block head_extra %}
+
+{% endblock %}
+
+{% block extra_js %}
+{% if data.has_data %}
+// Data injected from the backend (server-rendered JSON via |tojson)
+// NOTE(review): assumes base.html emits this block inside a <script> tag — confirm.
+var responseTimesData = {{ data.response_times | tojson }};
+var hourData = {{ data.patterns.by_hour | tojson }};
+var dowData = {{ data.patterns.by_dow | tojson }};
+
+// Response time chart (Chart.js); rebuilt whenever the period changes
+var rtCtx = document.getElementById('responseTimeChart');
+var rtChart = null;
+
+// Render the response-time line chart for the last `days` days.
+function renderResponseTimeChart(days) {
+    if (rtChart) rtChart.destroy();  // Chart.js requires destroy() before reusing a canvas
+
+    var cutoff = new Date();
+    cutoff.setDate(cutoff.getDate() - days);
+
+    // Client-side filtering: the backend always ships the full series
+    var filtered = responseTimesData.filter(function(d) {
+        return new Date(d.time) >= cutoff;
+    });
+
+    var labels = filtered.map(function(d) { return d.time; });
+    var values = filtered.map(function(d) { return d.ms; });
+
+    rtChart = new Chart(rtCtx, {
+        type: 'line',
+        data: {
+            labels: labels,
+            datasets: [{
+                label: 'Response time (ms)',
+                data: values,
+                borderColor: '#3b82f6',
+                backgroundColor: 'rgba(59, 130, 246, 0.1)',
+                fill: true,
+                tension: 0.3,
+                // Hide individual points on dense (long-period) charts
+                pointRadius: days <= 1 ? 3 : (days <= 7 ? 2 : 0),
+                borderWidth: 2
+            }]
+        },
+        options: {
+            responsive: true,
+            maintainAspectRatio: false,
+            plugins: {
+                legend: { display: false }
+            },
+            scales: {
+                x: {
+                    display: true,
+                    ticks: {
+                        maxTicksLimit: 12,
+                        font: { size: 10 }
+                    }
+                },
+                y: {
+                    beginAtZero: true,
+                    title: {
+                        display: true,
+                        text: 'ms'
+                    }
+                }
+            }
+        }
+    });
+}
+
+// Toggle the active period button and re-render the chart.
+function changeChartPeriod(days) {
+    document.querySelectorAll('.chart-btn').forEach(function(b) {
+        b.classList.toggle('active', parseInt(b.dataset.days) === days);
+    });
+    renderResponseTimeChart(days);
+}
+
+// Failure-pattern bar charts (guarded: the canvases only exist when there are incidents)
+if (document.getElementById('hourChart')) {
+    new Chart(document.getElementById('hourChart'), {
+        type: 'bar',
+        data: {
+            labels: Array.from({length: 24}, function(_, i) { return i + ':00'; }),
+            datasets: [{
+                data: hourData,
+                backgroundColor: 'rgba(239, 68, 68, 0.6)',
+                borderColor: '#ef4444',
+                borderWidth: 1
+            }]
+        },
+        options: {
+            responsive: true,
+            maintainAspectRatio: false,
+            plugins: { legend: { display: false } },
+            scales: {
+                y: { beginAtZero: true, ticks: { stepSize: 1 } },
+                x: { ticks: { font: { size: 9 } } }
+            }
+        }
+    });
+}
+
+if (document.getElementById('dowChart')) {
+    new Chart(document.getElementById('dowChart'), {
+        type: 'bar',
+        data: {
+            labels: ['Pon', 'Wt', 'Sr', 'Czw', 'Pt', 'Sob', 'Nie'],
+            datasets: [{
+                data: dowData,
+                backgroundColor: 'rgba(245, 158, 11, 0.6)',
+                borderColor: '#f59e0b',
+                borderWidth: 1
+            }]
+        },
+        options: {
+            responsive: true,
+            maintainAspectRatio: false,
+            plugins: { legend: { display: false } },
+            scales: {
+                y: { beginAtZero: true, ticks: { stepSize: 1 } }
+            }
+        }
+    });
+}
+
+// Initial render: last 24 hours
+renderResponseTimeChart(1);
+
+// CSRF token for the POST endpoints below
+var csrfToken = '{{ csrf_token() }}';
+
+// Persist an incident's cause (select onchange handler).
+// NOTE(review): no error handling or success feedback on these fetches —
+// a failed save is silent; confirm whether that is acceptable.
+function updateIncident(selectEl) {
+    var id = selectEl.dataset.incidentId;
+    var cause = selectEl.value;
+    fetch('/admin/api/uptime/incident/' + id + '/notes', {
+        method: 'POST',
+        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken},
+        body: JSON.stringify({cause: cause})
+    });
+}
+
+// Persist an incident's free-text notes (input change handler).
+function updateIncidentNotes(inputEl) {
+    var id = inputEl.dataset.incidentId;
+    var notes = inputEl.value;
+    fetch('/admin/api/uptime/incident/' + id + '/notes', {
+        method: 'POST',
+        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken},
+        body: JSON.stringify({notes: notes})
+    });
+}
+
+// Auto-refresh the "last updated" timestamp every 5 minutes
+setInterval(function() {
+    fetch('/admin/api/uptime')
+        .then(function(r) { return r.json(); })
+        .then(function(data) {
+            if (data.success) {
+                document.getElementById('refresh-time').textContent =
+                    new Date(data.timestamp).toLocaleTimeString('pl-PL');
+            }
+        })
+        .catch(function() {});  // best-effort refresh: ignore network failures
+}, 300000);
+{% endif %}
+{% endblock %}
\ No newline at end of file
diff --git a/templates/base.html b/templates/base.html
index 795a817..1161782 100755
--- a/templates/base.html
+++ b/templates/base.html
@@ -1789,6 +1789,12 @@
 Monitoring AI
 
+
+
+
+
+Monitoring uptime
+