fix: filter bots from analytics, use audit_logs for failed logins, logarithmic engagement score
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Add is_bot column to user_sessions with backfill from user_agent patterns
- Update analytics_daily trigger to skip bot sessions
- Recalculate 90 days of analytics_daily without bot contamination
- Replace cumulative failed_login_attempts with time-based audit_logs queries
- Switch engagement score from linear (capped at 100) to log2 scale
- Expand section_map from 9 to 17 categories (~95% traffic coverage)
- Exclude robots.txt, sitemap.xml, etc. from page view tracking
- Add bot filter to all overview, pages, paths, and engagement queries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-22 08:14:50 +01:00
parent fc9d979fea
commit cca52301a6
5 changed files with 262 additions and 82 deletions

View File

@ -9,6 +9,7 @@ page popularity, user flows, and behavioral profiles.
import csv import csv
import io import io
import logging import logging
import math
from datetime import date, timedelta, datetime from datetime import date, timedelta, datetime
from flask import render_template, request, redirect, url_for, flash, Response from flask import render_template, request, redirect, url_for, flash, Response
@ -27,6 +28,21 @@ from utils.decorators import role_required
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _non_bot_sessions(db, start_dt=None):
    """Build a subquery yielding IDs of sessions not flagged as bots.

    Used with ``PageView.session_id.in_(...)`` to exclude bot traffic
    from page-view aggregations. When ``start_dt`` is given, only
    sessions started at or after that datetime are included.
    """
    # NOTE: ``== False`` (not ``is_(False)``) is the established SQLAlchemy
    # comparison style in this module; keep it for consistency.  # noqa: E712
    sessions = db.query(UserSession.id).filter(UserSession.is_bot == False)
    if start_dt is not None:
        sessions = sessions.filter(UserSession.started_at >= start_dt)
    return sessions
def _log_engagement_score(raw):
"""Logarithmic engagement score: better distribution than linear capped at 100."""
if raw <= 0:
return 0
return min(100, int(math.log2(raw + 1) * 6))
def _get_period_dates(period): def _get_period_dates(period):
"""Return (start_date, days) for given period string.""" """Return (start_date, days) for given period string."""
today = date.today() today = date.today()
@ -95,11 +111,9 @@ def _tab_problems(db, start_date, days):
User.locked_until > now, User.is_active == True User.locked_until > now, User.is_active == True
).scalar() or 0 ).scalar() or 0
failed_logins_7d = db.query( failed_logins_7d = db.query(func.count(AuditLog.id)).filter(
func.coalesce(func.sum(User.failed_login_attempts), 0) AuditLog.action == 'login_failed',
).filter( AuditLog.created_at >= start_dt
User.is_active == True,
User.failed_login_attempts > 0
).scalar() or 0 ).scalar() or 0
password_resets_7d = db.query(func.count(EmailLog.id)).filter( password_resets_7d = db.query(func.count(EmailLog.id)).filter(
@ -116,8 +130,12 @@ def _tab_problems(db, start_date, days):
problem_users = [] problem_users = []
for user in users: for user in users:
# Failed logins # Failed logins (from audit_logs, time-based)
fl = user.failed_login_attempts or 0 fl = db.query(func.count(AuditLog.id)).filter(
AuditLog.user_email == user.email,
AuditLog.action == 'login_failed',
AuditLog.created_at >= start_dt
).scalar() or 0
# Security alerts 7d # Security alerts 7d
sa_7d = db.query(func.count(SecurityAlert.id)).filter( sa_7d = db.query(func.count(SecurityAlert.id)).filter(
@ -292,7 +310,8 @@ def _tab_engagement(db, start_date, days):
# Stat cards # Stat cards
active_7d = db.query(func.count(func.distinct(UserSession.user_id))).filter( active_7d = db.query(func.count(func.distinct(UserSession.user_id))).filter(
UserSession.user_id.isnot(None), UserSession.user_id.isnot(None),
UserSession.started_at >= start_dt UserSession.started_at >= start_dt,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
all_users = db.query(User).filter(User.is_active == True).all() all_users = db.query(User).filter(User.is_active == True).all()
@ -321,49 +340,57 @@ def _tab_engagement(db, start_date, days):
engagement_list = [] engagement_list = []
for user in registered_users: for user in registered_users:
# Current period # Current period (exclude bots)
sessions_cur = db.query(func.count(UserSession.id)).filter( sessions_cur = db.query(func.count(UserSession.id)).filter(
UserSession.user_id == user.id, UserSession.user_id == user.id,
UserSession.started_at >= start_dt UserSession.started_at >= start_dt,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
pv_cur = db.query(func.count(PageView.id)).filter( pv_cur = db.query(func.count(PageView.id)).filter(
PageView.user_id == user.id, PageView.user_id == user.id,
PageView.viewed_at >= start_dt PageView.viewed_at >= start_dt,
PageView.session_id.in_(_non_bot_sessions(db, start_dt))
).scalar() or 0 ).scalar() or 0
# Previous period for WoW # Previous period for WoW
sessions_prev = db.query(func.count(UserSession.id)).filter( sessions_prev = db.query(func.count(UserSession.id)).filter(
UserSession.user_id == user.id, UserSession.user_id == user.id,
UserSession.started_at >= prev_start, UserSession.started_at >= prev_start,
UserSession.started_at < start_dt UserSession.started_at < start_dt,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
pv_prev = db.query(func.count(PageView.id)).filter( pv_prev = db.query(func.count(PageView.id)).filter(
PageView.user_id == user.id, PageView.user_id == user.id,
PageView.viewed_at >= prev_start, PageView.viewed_at >= prev_start,
PageView.viewed_at < start_dt PageView.viewed_at < start_dt,
PageView.session_id.in_(_non_bot_sessions(db, prev_start))
).scalar() or 0 ).scalar() or 0
# 30d engagement score components # 30d engagement score components (exclude bots)
s30 = db.query(func.count(UserSession.id)).filter( s30 = db.query(func.count(UserSession.id)).filter(
UserSession.user_id == user.id, UserSession.user_id == user.id,
UserSession.started_at >= start_30d UserSession.started_at >= start_30d,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
pv30 = db.query(func.count(PageView.id)).filter( pv30 = db.query(func.count(PageView.id)).filter(
PageView.user_id == user.id, PageView.user_id == user.id,
PageView.viewed_at >= start_30d PageView.viewed_at >= start_30d,
PageView.session_id.in_(_non_bot_sessions(db, start_30d))
).scalar() or 0 ).scalar() or 0
clicks30 = db.query(func.sum(UserSession.clicks_count)).filter( clicks30 = db.query(func.sum(UserSession.clicks_count)).filter(
UserSession.user_id == user.id, UserSession.user_id == user.id,
UserSession.started_at >= start_30d UserSession.started_at >= start_30d,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
dur30 = db.query(func.sum(UserSession.duration_seconds)).filter( dur30 = db.query(func.sum(UserSession.duration_seconds)).filter(
UserSession.user_id == user.id, UserSession.user_id == user.id,
UserSession.started_at >= start_30d UserSession.started_at >= start_30d,
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
conv30 = db.query(func.count(ConversionEvent.id)).filter( conv30 = db.query(func.count(ConversionEvent.id)).filter(
@ -376,11 +403,9 @@ def _tab_engagement(db, start_date, days):
SearchQuery.searched_at >= start_30d SearchQuery.searched_at >= start_30d
).scalar() or 0 ).scalar() or 0
score = min(100, raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 + int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2 score = _log_engagement_score(raw)
)
score = int(score)
# WoW change # WoW change
wow = None wow = None
@ -444,16 +469,24 @@ def _tab_pages(db, start_date, days):
"""Page popularity map.""" """Page popularity map."""
start_dt = datetime.combine(start_date, datetime.min.time()) start_dt = datetime.combine(start_date, datetime.min.time())
# Page sections with grouping # Page sections with grouping (expanded to cover ~95% of traffic)
section_map = { section_map = {
'Strona główna': ['/'], 'Strona główna': ['/'],
'Profile firm': ['/company/'], 'Profile firm': ['/company/'],
'Forum': ['/forum'], 'Forum': ['/forum'],
'Chat': ['/chat'], 'Chat': ['/chat'],
'Wyszukiwarka': ['/search', '/szukaj'], 'Wyszukiwarka': ['/search', '/szukaj'],
'Wydarzenia': ['/events', '/wydarzenia'], 'Wydarzenia': ['/events', '/wydarzenia', '/kalendarz'],
'Ogłoszenia': ['/classifieds', '/ogloszenia'], 'Ogłoszenia': ['/classifieds', '/ogloszenia', '/tablica'],
'Członkostwo': ['/membership', '/czlonkostwo'], 'Członkostwo': ['/membership', '/czlonkostwo', '/korzysci'],
'Logowanie': ['/login', '/register', '/forgot-password', '/reset-password', '/verify-email'],
'Panel użytkownika': ['/dashboard', '/konto'],
'Wiadomości': ['/wiadomosci'],
'Edukacja': ['/edukacja'],
'Rada': ['/rada'],
'ZOPK': ['/zopk'],
'Kontakty': ['/kontakty'],
'Raporty': ['/raporty'],
'Admin': ['/admin'], 'Admin': ['/admin'],
} }
@ -467,9 +500,10 @@ def _tab_pages(db, start_date, days):
func.count(PageView.id).label('views'), func.count(PageView.id).label('views'),
func.count(func.distinct(PageView.user_id)).label('unique_users'), func.count(func.distinct(PageView.user_id)).label('unique_users'),
func.avg(PageView.time_on_page_seconds).label('avg_time') func.avg(PageView.time_on_page_seconds).label('avg_time')
).filter( ).join(UserSession, PageView.session_id == UserSession.id).filter(
or_(*conditions), or_(*conditions),
PageView.viewed_at >= start_dt PageView.viewed_at >= start_dt,
UserSession.is_bot == False
).first() ).first()
sections.append({ sections.append({
@ -484,7 +518,7 @@ def _tab_pages(db, start_date, days):
for s in sections: for s in sections:
s['intensity'] = min(100, int(s['views'] / max_views * 100)) s['intensity'] = min(100, int(s['views'] / max_views * 100))
# Top 50 pages # Top 50 pages (exclude bots)
top_pages = db.query( top_pages = db.query(
PageView.path, PageView.path,
func.count(PageView.id).label('views'), func.count(PageView.id).label('views'),
@ -492,8 +526,9 @@ def _tab_pages(db, start_date, days):
func.avg(PageView.time_on_page_seconds).label('avg_time'), func.avg(PageView.time_on_page_seconds).label('avg_time'),
func.avg(PageView.scroll_depth_percent).label('avg_scroll'), func.avg(PageView.scroll_depth_percent).label('avg_scroll'),
func.avg(PageView.load_time_ms).label('avg_load'), func.avg(PageView.load_time_ms).label('avg_load'),
).filter( ).join(UserSession, PageView.session_id == UserSession.id).filter(
PageView.viewed_at >= start_dt PageView.viewed_at >= start_dt,
UserSession.is_bot == False
).group_by(PageView.path).order_by(desc('views')).limit(50).all() ).group_by(PageView.path).order_by(desc('views')).limit(50).all()
max_page_views = top_pages[0].views if top_pages else 1 max_page_views = top_pages[0].views if top_pages else 1
@ -510,13 +545,14 @@ def _tab_pages(db, start_date, days):
'bar_pct': int(p.views / max_page_views * 100), 'bar_pct': int(p.views / max_page_views * 100),
}) })
# Ignored pages (< 5 views in 30d) # Ignored pages (< 5 views in 30d, exclude bots)
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time()) start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
ignored = db.query( ignored = db.query(
PageView.path, PageView.path,
func.count(PageView.id).label('views'), func.count(PageView.id).label('views'),
).filter( ).join(UserSession, PageView.session_id == UserSession.id).filter(
PageView.viewed_at >= start_30d PageView.viewed_at >= start_30d,
UserSession.is_bot == False
).group_by(PageView.path).having( ).group_by(PageView.path).having(
func.count(PageView.id) < 5 func.count(PageView.id) < 5
).order_by('views').limit(30).all() ).order_by('views').limit(30).all()
@ -536,13 +572,14 @@ def _tab_paths(db, start_date, days):
"""User flow analysis.""" """User flow analysis."""
start_dt = datetime.combine(start_date, datetime.min.time()) start_dt = datetime.combine(start_date, datetime.min.time())
# Entry pages - first page in each session # Entry pages - first page in each session (exclude bots)
entry_sql = text(""" entry_sql = text("""
WITH first_pages AS ( WITH first_pages AS (
SELECT DISTINCT ON (session_id) path SELECT DISTINCT ON (pv.session_id) pv.path
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
ORDER BY session_id, viewed_at ASC WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
ORDER BY pv.session_id, pv.viewed_at ASC
) )
SELECT path, COUNT(*) as cnt SELECT path, COUNT(*) as cnt
FROM first_pages FROM first_pages
@ -550,13 +587,14 @@ def _tab_paths(db, start_date, days):
""") """)
entry_pages = db.execute(entry_sql, {'start_dt': start_dt}).fetchall() entry_pages = db.execute(entry_sql, {'start_dt': start_dt}).fetchall()
# Exit pages - last page in each session # Exit pages - last page in each session (exclude bots)
exit_sql = text(""" exit_sql = text("""
WITH last_pages AS ( WITH last_pages AS (
SELECT DISTINCT ON (session_id) path SELECT DISTINCT ON (pv.session_id) pv.path
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
ORDER BY session_id, viewed_at DESC WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
ORDER BY pv.session_id, pv.viewed_at DESC
) )
SELECT path, COUNT(*) as cnt SELECT path, COUNT(*) as cnt
FROM last_pages FROM last_pages
@ -567,13 +605,14 @@ def _tab_paths(db, start_date, days):
max_entry = entry_pages[0].cnt if entry_pages else 1 max_entry = entry_pages[0].cnt if entry_pages else 1
max_exit = exit_pages[0].cnt if exit_pages else 1 max_exit = exit_pages[0].cnt if exit_pages else 1
# Top transitions # Top transitions (exclude bots)
transitions_sql = text(""" transitions_sql = text("""
WITH ordered AS ( WITH ordered AS (
SELECT session_id, path, SELECT pv.session_id, pv.path,
LEAD(path) OVER (PARTITION BY session_id ORDER BY viewed_at) AS next_path LEAD(pv.path) OVER (PARTITION BY pv.session_id ORDER BY pv.viewed_at) AS next_path
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
) )
SELECT path, next_path, COUNT(*) as cnt SELECT path, next_path, COUNT(*) as cnt
FROM ordered FROM ordered
@ -582,21 +621,23 @@ def _tab_paths(db, start_date, days):
""") """)
transitions = db.execute(transitions_sql, {'start_dt': start_dt}).fetchall() transitions = db.execute(transitions_sql, {'start_dt': start_dt}).fetchall()
# Drop-off pages (high exit rate) # Drop-off pages (high exit rate, exclude bots)
dropoff_sql = text(""" dropoff_sql = text("""
WITH page_stats AS ( WITH page_stats AS (
SELECT path, COUNT(*) as total_views SELECT pv.path, COUNT(*) as total_views
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
GROUP BY path HAVING COUNT(*) >= 5 WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
GROUP BY pv.path HAVING COUNT(*) >= 5
), ),
exit_stats AS ( exit_stats AS (
SELECT path, COUNT(*) as exit_count SELECT path, COUNT(*) as exit_count
FROM ( FROM (
SELECT DISTINCT ON (session_id) path SELECT DISTINCT ON (pv.session_id) pv.path
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
ORDER BY session_id, viewed_at DESC WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
ORDER BY pv.session_id, pv.viewed_at DESC
) lp ) lp
GROUP BY path GROUP BY path
) )
@ -609,7 +650,7 @@ def _tab_paths(db, start_date, days):
""") """)
dropoff = db.execute(dropoff_sql, {'start_dt': start_dt}).fetchall() dropoff = db.execute(dropoff_sql, {'start_dt': start_dt}).fetchall()
# Session length distribution # Session length distribution (exclude bots)
session_length_sql = text(""" session_length_sql = text("""
SELECT SELECT
CASE CASE
@ -621,10 +662,11 @@ def _tab_paths(db, start_date, days):
END as bucket, END as bucket,
COUNT(*) as cnt COUNT(*) as cnt
FROM ( FROM (
SELECT session_id, COUNT(*) as pv_count SELECT pv.session_id, COUNT(*) as pv_count
FROM page_views FROM page_views pv
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL JOIN user_sessions us ON pv.session_id = us.id
GROUP BY session_id WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
GROUP BY pv.session_id
) session_counts ) session_counts
GROUP BY bucket GROUP BY bucket
ORDER BY MIN(pv_count) ORDER BY MIN(pv_count)
@ -651,14 +693,13 @@ def _tab_overview(db, start_date, days):
start_dt = datetime.combine(start_date, datetime.min.time()) start_dt = datetime.combine(start_date, datetime.min.time())
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time()) start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
# Daily sessions + page views (30d) # Daily sessions from analytics_daily (already bot-filtered after migration)
daily_data = db.query(AnalyticsDaily).filter( daily_data = db.query(AnalyticsDaily).filter(
AnalyticsDaily.date >= date.today() - timedelta(days=30) AnalyticsDaily.date >= date.today() - timedelta(days=30)
).order_by(AnalyticsDaily.date).all() ).order_by(AnalyticsDaily.date).all()
chart_labels = [] chart_labels = []
chart_sessions = [] chart_sessions = []
chart_pageviews = []
for d in daily_data: for d in daily_data:
chart_labels.append(d.date.strftime('%d.%m')) chart_labels.append(d.date.strftime('%d.%m'))
if filter_type == 'logged': if filter_type == 'logged':
@ -667,15 +708,36 @@ def _tab_overview(db, start_date, days):
chart_sessions.append(d.anonymous_sessions or 0) chart_sessions.append(d.anonymous_sessions or 0)
else: else:
chart_sessions.append(d.total_sessions or 0) chart_sessions.append(d.total_sessions or 0)
chart_pageviews.append(d.total_page_views or 0)
# Hourly heatmap (7 days x 24 hours) # Daily page views from raw PageView + JOIN (bot-filtered, supports logged/anon filter)
pv_filter = [
PageView.viewed_at >= start_30d,
UserSession.is_bot == False,
]
if filter_type == 'logged':
pv_filter.append(UserSession.user_id.isnot(None))
elif filter_type == 'anonymous':
pv_filter.append(UserSession.user_id.is_(None))
pv_daily = db.query(
func.date(PageView.viewed_at).label('day'),
func.count(PageView.id).label('cnt')
).join(UserSession, PageView.session_id == UserSession.id).filter(
*pv_filter
).group_by(func.date(PageView.viewed_at)).all()
pv_by_date = {str(r.day): r.cnt for r in pv_daily}
chart_pageviews = []
for d in daily_data:
chart_pageviews.append(pv_by_date.get(str(d.date), 0))
# Hourly heatmap (7 days x 24 hours, exclude bots)
heatmap_sql = text(""" heatmap_sql = text("""
SELECT EXTRACT(DOW FROM started_at)::int as dow, SELECT EXTRACT(DOW FROM started_at)::int as dow,
EXTRACT(HOUR FROM started_at)::int as hour, EXTRACT(HOUR FROM started_at)::int as hour,
COUNT(*) as cnt COUNT(*) as cnt
FROM user_sessions FROM user_sessions
WHERE started_at >= :start_dt WHERE started_at >= :start_dt AND is_bot = false
GROUP BY dow, hour GROUP BY dow, hour
""") """)
heatmap_raw = db.execute(heatmap_sql, {'start_dt': start_30d}).fetchall() heatmap_raw = db.execute(heatmap_sql, {'start_dt': start_30d}).fetchall()
@ -697,23 +759,25 @@ def _tab_overview(db, start_date, days):
row['hours'].append({'count': cnt, 'intensity': intensity}) row['hours'].append({'count': cnt, 'intensity': intensity})
heatmap_grid.append(row) heatmap_grid.append(row)
# Logged vs Anonymous # Logged vs Anonymous (exclude bots)
total_logged = db.query(func.count(UserSession.id)).filter( total_logged = db.query(func.count(UserSession.id)).filter(
UserSession.started_at >= start_30d, UserSession.started_at >= start_30d,
UserSession.user_id.isnot(None) UserSession.user_id.isnot(None),
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
total_anon = db.query(func.count(UserSession.id)).filter( total_anon = db.query(func.count(UserSession.id)).filter(
UserSession.started_at >= start_30d, UserSession.started_at >= start_30d,
UserSession.user_id.is_(None) UserSession.user_id.is_(None),
UserSession.is_bot == False
).scalar() or 0 ).scalar() or 0
# Devices over time (weekly) # Devices over time (weekly, exclude bots)
devices_sql = text(""" devices_sql = text("""
SELECT DATE_TRUNC('week', started_at)::date as week, SELECT DATE_TRUNC('week', started_at)::date as week,
device_type, device_type,
COUNT(*) as cnt COUNT(*) as cnt
FROM user_sessions FROM user_sessions
WHERE started_at >= :start_dt WHERE started_at >= :start_dt AND is_bot = false
GROUP BY week, device_type GROUP BY week, device_type
ORDER BY week ORDER BY week
""") """)
@ -793,13 +857,16 @@ def user_insights_profile(user_id):
SearchQuery.user_id == user_id, SearchQuery.searched_at >= start_30d SearchQuery.user_id == user_id, SearchQuery.searched_at >= start_30d
).scalar() or 0 ).scalar() or 0
engagement_score = min(100, int( raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 + int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2 engagement_score = _log_engagement_score(raw)
))
# Problem score # Problem score (failed logins from audit_logs, time-based)
fl = user.failed_login_attempts or 0 fl = db.query(func.count(AuditLog.id)).filter(
AuditLog.user_email == user.email,
AuditLog.action == 'login_failed',
AuditLog.created_at >= start_7d
).scalar() or 0
sa_7d = db.query(func.count(SecurityAlert.id)).filter( sa_7d = db.query(func.count(SecurityAlert.id)).filter(
SecurityAlert.user_email == user.email, SecurityAlert.user_email == user.email,
SecurityAlert.created_at >= start_7d SecurityAlert.created_at >= start_7d
@ -1166,7 +1233,7 @@ def user_insights_profile(user_id):
PageView.viewed_at < d_end PageView.viewed_at < d_end
).scalar() or 0 ).scalar() or 0
daily_score = min(30, d_sessions * 3 + d_pv) daily_score = _log_engagement_score(d_sessions * 3 + d_pv)
trend_labels.append(d.strftime('%d.%m')) trend_labels.append(d.strftime('%d.%m'))
trend_scores.append(daily_score) trend_scores.append(daily_score)

View File

@ -4144,6 +4144,9 @@ class UserSession(Base):
page_views_count = Column(Integer, default=0) page_views_count = Column(Integer, default=0)
clicks_count = Column(Integer, default=0) clicks_count = Column(Integer, default=0)
# Bot detection
is_bot = Column(Boolean, default=False)
# UTM Parameters (kampanie marketingowe) # UTM Parameters (kampanie marketingowe)
utm_source = Column(String(255), nullable=True) # google, facebook, newsletter utm_source = Column(String(255), nullable=True) # google, facebook, newsletter
utm_medium = Column(String(255), nullable=True) # cpc, email, social, organic utm_medium = Column(String(255), nullable=True) # cpc, email, social, organic

View File

@ -0,0 +1,98 @@
-- Migration 079: Bot Filtering for Analytics
-- Adds is_bot column to user_sessions, backfills from user_agent patterns,
-- updates analytics_daily trigger to exclude bots, recalculates 90 days of data.
-- Safe to re-run: the ALTER and CREATE INDEX are IF NOT EXISTS, the function is
-- CREATE OR REPLACE, and the backfill/recalc UPDATEs are idempotent.
-- 1. Add column
-- DEFAULT false means pre-existing rows come out non-bot until backfilled below.
ALTER TABLE user_sessions ADD COLUMN IF NOT EXISTS is_bot BOOLEAN DEFAULT false;
-- 2. Backfill from user_agent patterns
-- Covers crawlers, HTTP client libraries, and security scanners.
-- NOTE: a NULL user_agent is also treated as a bot (last predicate) — real
-- browsers always send a User-Agent header.
UPDATE user_sessions SET is_bot = true
WHERE user_agent ILIKE '%bot%'
   OR user_agent ILIKE '%crawler%'
   OR user_agent ILIKE '%spider%'
   OR user_agent ILIKE '%curl/%'
   OR user_agent ILIKE '%python-requests%'
   OR user_agent ILIKE '%axios/%'
   OR user_agent ILIKE '%wget/%'
   OR user_agent ILIKE '%Scrapy%'
   OR user_agent ILIKE '%Java/%'
   OR user_agent ILIKE '%Go-http%'
   OR user_agent ILIKE '%Werkzeug%'
   OR user_agent ILIKE '%LeakIx%'
   OR user_agent ILIKE '%Nuclei%'
   OR user_agent ILIKE '%masscan%'
   OR user_agent ILIKE '%nmap%'
   OR user_agent ILIKE '%zgrab%'
   OR user_agent ILIKE '%httpx%'
   OR user_agent ILIKE '%censys%'
   OR user_agent ILIKE '%shodan%'
   OR user_agent IS NULL;
-- 3. Partial index for non-bot sessions (most queries filter on this)
-- Partial (WHERE is_bot = false) so the index only holds the rows the
-- analytics queries actually scan.
CREATE INDEX IF NOT EXISTS idx_us_is_bot ON user_sessions(is_bot) WHERE is_bot = false;
-- 4. Update analytics_daily trigger to skip bot sessions
-- Bot rows RETURN NEW early, so they are stored but never counted into
-- analytics_daily. For page_views the session is looked up to decide.
CREATE OR REPLACE FUNCTION update_analytics_daily()
RETURNS TRIGGER AS $$
DECLARE target_date DATE;
BEGIN
    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- Skip bot sessions entirely.
        IF NEW.is_bot = true THEN RETURN NEW; END IF;
        target_date := DATE(NEW.started_at);
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        -- Skip page views belonging to a bot session. Views with a NULL
        -- session_id fall through and are still counted.
        IF NEW.session_id IS NOT NULL THEN
            IF EXISTS (SELECT 1 FROM user_sessions WHERE id = NEW.session_id AND is_bot = true) THEN
                RETURN NEW;
            END IF;
        END IF;
        target_date := DATE(NEW.viewed_at);
    ELSE RETURN NEW;
    END IF;
    -- Ensure the daily row exists before incrementing it.
    INSERT INTO analytics_daily (date, total_sessions, total_page_views, updated_at)
    VALUES (target_date, 0, 0, NOW()) ON CONFLICT (date) DO NOTHING;
    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- total_sessions is incremented; the per-segment columns are
        -- recomputed from scratch with an is_bot = false filter.
        UPDATE analytics_daily SET
            total_sessions = total_sessions + 1,
            unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
                            WHERE DATE(started_at) = target_date AND user_id IS NOT NULL AND is_bot = false),
            anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
                                  WHERE DATE(started_at) = target_date AND user_id IS NULL AND is_bot = false),
            desktop_sessions = (SELECT COUNT(*) FROM user_sessions
                                WHERE DATE(started_at) = target_date AND device_type = 'desktop' AND is_bot = false),
            mobile_sessions = (SELECT COUNT(*) FROM user_sessions
                               WHERE DATE(started_at) = target_date AND device_type = 'mobile' AND is_bot = false),
            tablet_sessions = (SELECT COUNT(*) FROM user_sessions
                               WHERE DATE(started_at) = target_date AND device_type = 'tablet' AND is_bot = false),
            updated_at = NOW()
        WHERE date = target_date;
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        UPDATE analytics_daily SET total_page_views = total_page_views + 1, updated_at = NOW()
        WHERE date = target_date;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- 5. Recalculate analytics_daily for last 90 days (remove bot contamination)
-- Full recompute per day; page views with no joinable session are dropped
-- from total_page_views here (JOIN, not LEFT JOIN).
UPDATE analytics_daily ad SET
    total_sessions = (SELECT COUNT(*) FROM user_sessions
                      WHERE DATE(started_at) = ad.date AND is_bot = false),
    total_page_views = (SELECT COUNT(*) FROM page_views pv
                        JOIN user_sessions us ON pv.session_id = us.id
                        WHERE DATE(pv.viewed_at) = ad.date AND us.is_bot = false),
    unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
                    WHERE DATE(started_at) = ad.date AND user_id IS NOT NULL AND is_bot = false),
    anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
                          WHERE DATE(started_at) = ad.date AND user_id IS NULL AND is_bot = false),
    desktop_sessions = (SELECT COUNT(*) FROM user_sessions
                        WHERE DATE(started_at) = ad.date AND device_type = 'desktop' AND is_bot = false),
    mobile_sessions = (SELECT COUNT(*) FROM user_sessions
                       WHERE DATE(started_at) = ad.date AND device_type = 'mobile' AND is_bot = false),
    tablet_sessions = (SELECT COUNT(*) FROM user_sessions
                       WHERE DATE(started_at) = ad.date AND device_type = 'tablet' AND is_bot = false)
WHERE ad.date >= CURRENT_DATE - 90;
-- 6. Grants
GRANT ALL ON TABLE user_sessions TO nordabiz_app;

View File

@ -54,12 +54,16 @@ def get_or_create_analytics_session():
browser_version = ua.browser.version_string browser_version = ua.browser.version_string
os_name = ua.os.family os_name = ua.os.family
os_version = ua.os.version_string os_version = ua.os.version_string
is_bot = ua.is_bot or any(p in ua_string.lower() for p in
['curl/', 'python-requests', 'axios/', 'wget/', 'scrapy',
'werkzeug', 'leakix', 'nuclei', 'masscan', 'zgrab', 'httpx'])
except Exception: except Exception:
device_type = 'desktop' device_type = 'desktop'
browser = 'Unknown' browser = 'Unknown'
browser_version = '' browser_version = ''
os_name = 'Unknown' os_name = 'Unknown'
os_version = '' os_version = ''
is_bot = False
user_session = UserSession( user_session = UserSession(
session_id=analytics_session_id, session_id=analytics_session_id,
@ -70,7 +74,8 @@ def get_or_create_analytics_session():
browser=browser[:50] if browser else None, browser=browser[:50] if browser else None,
browser_version=browser_version[:20] if browser_version else None, browser_version=browser_version[:20] if browser_version else None,
os=os_name[:50] if os_name else None, os=os_name[:50] if os_name else None,
os_version=os_version[:20] if os_version else None os_version=os_version[:20] if os_version else None,
is_bot=is_bot
) )
db.add(user_session) db.add(user_session)
db.commit() db.commit()

View File

@ -77,6 +77,13 @@ def register_middleware(app):
if request.path == '/favicon.ico': if request.path == '/favicon.ico':
return return
# Skip bot/AJAX utility paths
skip_exact = {'/robots.txt', '/sitemap.xml', '/manifest.json',
'/check-verification-status', '/resend-verification'}
skip_prefixes = ('/.well-known/',)
if request.path in skip_exact or any(request.path.startswith(p) for p in skip_prefixes):
return
try: try:
from utils.analytics import ( from utils.analytics import (
track_page_view_for_request, track_page_view_for_request,