fix: filter bots from analytics, use audit_logs for failed logins, logarithmic engagement score
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add is_bot column to user_sessions with backfill from user_agent patterns
- Update analytics_daily trigger to skip bot sessions
- Recalculate 90 days of analytics_daily without bot contamination
- Replace cumulative failed_login_attempts with time-based audit_logs queries
- Switch engagement score from linear (capped at 100) to log2 scale
- Expand section_map from 9 to 17 categories (~95% traffic coverage)
- Exclude robots.txt, sitemap.xml, etc. from page view tracking
- Add bot filter to all overview, pages, paths, and engagement queries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
fc9d979fea
commit
cca52301a6
@ -9,6 +9,7 @@ page popularity, user flows, and behavioral profiles.
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import math
|
||||
from datetime import date, timedelta, datetime
|
||||
|
||||
from flask import render_template, request, redirect, url_for, flash, Response
|
||||
@ -27,6 +28,21 @@ from utils.decorators import role_required
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _non_bot_sessions(db, start_dt=None):
    """Build a query yielding the IDs of sessions not flagged as bots.

    The result is meant to be passed to ``in_()`` when filtering page views
    by ``session_id``.

    Args:
        db: Database session exposing ``query()``.
        start_dt: Optional lower bound; when given, only sessions whose
            ``started_at`` is at or after it are selected.

    Returns:
        The un-executed query over ``UserSession.id``.
    """
    # ``== False`` builds a SQL expression here; it is not a Python comparison.
    session_ids = db.query(UserSession.id).filter(UserSession.is_bot == False)  # noqa: E712
    if not start_dt:
        return session_ids
    return session_ids.filter(UserSession.started_at >= start_dt)
|
||||
|
||||
|
||||
def _log_engagement_score(raw):
|
||||
"""Logarithmic engagement score: better distribution than linear capped at 100."""
|
||||
if raw <= 0:
|
||||
return 0
|
||||
return min(100, int(math.log2(raw + 1) * 6))
|
||||
|
||||
|
||||
def _get_period_dates(period):
|
||||
"""Return (start_date, days) for given period string."""
|
||||
today = date.today()
|
||||
@ -95,11 +111,9 @@ def _tab_problems(db, start_date, days):
|
||||
User.locked_until > now, User.is_active == True
|
||||
).scalar() or 0
|
||||
|
||||
failed_logins_7d = db.query(
|
||||
func.coalesce(func.sum(User.failed_login_attempts), 0)
|
||||
).filter(
|
||||
User.is_active == True,
|
||||
User.failed_login_attempts > 0
|
||||
failed_logins_7d = db.query(func.count(AuditLog.id)).filter(
|
||||
AuditLog.action == 'login_failed',
|
||||
AuditLog.created_at >= start_dt
|
||||
).scalar() or 0
|
||||
|
||||
password_resets_7d = db.query(func.count(EmailLog.id)).filter(
|
||||
@ -116,8 +130,12 @@ def _tab_problems(db, start_date, days):
|
||||
problem_users = []
|
||||
|
||||
for user in users:
|
||||
# Failed logins
|
||||
fl = user.failed_login_attempts or 0
|
||||
# Failed logins (from audit_logs, time-based)
|
||||
fl = db.query(func.count(AuditLog.id)).filter(
|
||||
AuditLog.user_email == user.email,
|
||||
AuditLog.action == 'login_failed',
|
||||
AuditLog.created_at >= start_dt
|
||||
).scalar() or 0
|
||||
|
||||
# Security alerts 7d
|
||||
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
||||
@ -292,7 +310,8 @@ def _tab_engagement(db, start_date, days):
|
||||
# Stat cards
|
||||
active_7d = db.query(func.count(func.distinct(UserSession.user_id))).filter(
|
||||
UserSession.user_id.isnot(None),
|
||||
UserSession.started_at >= start_dt
|
||||
UserSession.started_at >= start_dt,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
all_users = db.query(User).filter(User.is_active == True).all()
|
||||
@ -321,49 +340,57 @@ def _tab_engagement(db, start_date, days):
|
||||
|
||||
engagement_list = []
|
||||
for user in registered_users:
|
||||
# Current period
|
||||
# Current period (exclude bots)
|
||||
sessions_cur = db.query(func.count(UserSession.id)).filter(
|
||||
UserSession.user_id == user.id,
|
||||
UserSession.started_at >= start_dt
|
||||
UserSession.started_at >= start_dt,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
pv_cur = db.query(func.count(PageView.id)).filter(
|
||||
PageView.user_id == user.id,
|
||||
PageView.viewed_at >= start_dt
|
||||
PageView.viewed_at >= start_dt,
|
||||
PageView.session_id.in_(_non_bot_sessions(db, start_dt))
|
||||
).scalar() or 0
|
||||
|
||||
# Previous period for WoW
|
||||
sessions_prev = db.query(func.count(UserSession.id)).filter(
|
||||
UserSession.user_id == user.id,
|
||||
UserSession.started_at >= prev_start,
|
||||
UserSession.started_at < start_dt
|
||||
UserSession.started_at < start_dt,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
pv_prev = db.query(func.count(PageView.id)).filter(
|
||||
PageView.user_id == user.id,
|
||||
PageView.viewed_at >= prev_start,
|
||||
PageView.viewed_at < start_dt
|
||||
PageView.viewed_at < start_dt,
|
||||
PageView.session_id.in_(_non_bot_sessions(db, prev_start))
|
||||
).scalar() or 0
|
||||
|
||||
# 30d engagement score components
|
||||
# 30d engagement score components (exclude bots)
|
||||
s30 = db.query(func.count(UserSession.id)).filter(
|
||||
UserSession.user_id == user.id,
|
||||
UserSession.started_at >= start_30d
|
||||
UserSession.started_at >= start_30d,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
pv30 = db.query(func.count(PageView.id)).filter(
|
||||
PageView.user_id == user.id,
|
||||
PageView.viewed_at >= start_30d
|
||||
PageView.viewed_at >= start_30d,
|
||||
PageView.session_id.in_(_non_bot_sessions(db, start_30d))
|
||||
).scalar() or 0
|
||||
|
||||
clicks30 = db.query(func.sum(UserSession.clicks_count)).filter(
|
||||
UserSession.user_id == user.id,
|
||||
UserSession.started_at >= start_30d
|
||||
UserSession.started_at >= start_30d,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
dur30 = db.query(func.sum(UserSession.duration_seconds)).filter(
|
||||
UserSession.user_id == user.id,
|
||||
UserSession.started_at >= start_30d
|
||||
UserSession.started_at >= start_30d,
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
conv30 = db.query(func.count(ConversionEvent.id)).filter(
|
||||
@ -376,11 +403,9 @@ def _tab_engagement(db, start_date, days):
|
||||
SearchQuery.searched_at >= start_30d
|
||||
).scalar() or 0
|
||||
|
||||
score = min(100,
|
||||
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2
|
||||
)
|
||||
score = int(score)
|
||||
raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
|
||||
score = _log_engagement_score(raw)
|
||||
|
||||
# WoW change
|
||||
wow = None
|
||||
@ -444,16 +469,24 @@ def _tab_pages(db, start_date, days):
|
||||
"""Page popularity map."""
|
||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||
|
||||
# Page sections with grouping
|
||||
# Page sections with grouping (expanded to cover ~95% of traffic)
|
||||
section_map = {
|
||||
'Strona główna': ['/'],
|
||||
'Profile firm': ['/company/'],
|
||||
'Forum': ['/forum'],
|
||||
'Chat': ['/chat'],
|
||||
'Wyszukiwarka': ['/search', '/szukaj'],
|
||||
'Wydarzenia': ['/events', '/wydarzenia'],
|
||||
'Ogłoszenia': ['/classifieds', '/ogloszenia'],
|
||||
'Członkostwo': ['/membership', '/czlonkostwo'],
|
||||
'Wydarzenia': ['/events', '/wydarzenia', '/kalendarz'],
|
||||
'Ogłoszenia': ['/classifieds', '/ogloszenia', '/tablica'],
|
||||
'Członkostwo': ['/membership', '/czlonkostwo', '/korzysci'],
|
||||
'Logowanie': ['/login', '/register', '/forgot-password', '/reset-password', '/verify-email'],
|
||||
'Panel użytkownika': ['/dashboard', '/konto'],
|
||||
'Wiadomości': ['/wiadomosci'],
|
||||
'Edukacja': ['/edukacja'],
|
||||
'Rada': ['/rada'],
|
||||
'ZOPK': ['/zopk'],
|
||||
'Kontakty': ['/kontakty'],
|
||||
'Raporty': ['/raporty'],
|
||||
'Admin': ['/admin'],
|
||||
}
|
||||
|
||||
@ -467,9 +500,10 @@ def _tab_pages(db, start_date, days):
|
||||
func.count(PageView.id).label('views'),
|
||||
func.count(func.distinct(PageView.user_id)).label('unique_users'),
|
||||
func.avg(PageView.time_on_page_seconds).label('avg_time')
|
||||
).filter(
|
||||
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||
or_(*conditions),
|
||||
PageView.viewed_at >= start_dt
|
||||
PageView.viewed_at >= start_dt,
|
||||
UserSession.is_bot == False
|
||||
).first()
|
||||
|
||||
sections.append({
|
||||
@ -484,7 +518,7 @@ def _tab_pages(db, start_date, days):
|
||||
for s in sections:
|
||||
s['intensity'] = min(100, int(s['views'] / max_views * 100))
|
||||
|
||||
# Top 50 pages
|
||||
# Top 50 pages (exclude bots)
|
||||
top_pages = db.query(
|
||||
PageView.path,
|
||||
func.count(PageView.id).label('views'),
|
||||
@ -492,8 +526,9 @@ def _tab_pages(db, start_date, days):
|
||||
func.avg(PageView.time_on_page_seconds).label('avg_time'),
|
||||
func.avg(PageView.scroll_depth_percent).label('avg_scroll'),
|
||||
func.avg(PageView.load_time_ms).label('avg_load'),
|
||||
).filter(
|
||||
PageView.viewed_at >= start_dt
|
||||
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||
PageView.viewed_at >= start_dt,
|
||||
UserSession.is_bot == False
|
||||
).group_by(PageView.path).order_by(desc('views')).limit(50).all()
|
||||
|
||||
max_page_views = top_pages[0].views if top_pages else 1
|
||||
@ -510,13 +545,14 @@ def _tab_pages(db, start_date, days):
|
||||
'bar_pct': int(p.views / max_page_views * 100),
|
||||
})
|
||||
|
||||
# Ignored pages (< 5 views in 30d)
|
||||
# Ignored pages (< 5 views in 30d, exclude bots)
|
||||
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
||||
ignored = db.query(
|
||||
PageView.path,
|
||||
func.count(PageView.id).label('views'),
|
||||
).filter(
|
||||
PageView.viewed_at >= start_30d
|
||||
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||
PageView.viewed_at >= start_30d,
|
||||
UserSession.is_bot == False
|
||||
).group_by(PageView.path).having(
|
||||
func.count(PageView.id) < 5
|
||||
).order_by('views').limit(30).all()
|
||||
@ -536,13 +572,14 @@ def _tab_paths(db, start_date, days):
|
||||
"""User flow analysis."""
|
||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||
|
||||
# Entry pages - first page in each session
|
||||
# Entry pages - first page in each session (exclude bots)
|
||||
entry_sql = text("""
|
||||
WITH first_pages AS (
|
||||
SELECT DISTINCT ON (session_id) path
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
ORDER BY session_id, viewed_at ASC
|
||||
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
ORDER BY pv.session_id, pv.viewed_at ASC
|
||||
)
|
||||
SELECT path, COUNT(*) as cnt
|
||||
FROM first_pages
|
||||
@ -550,13 +587,14 @@ def _tab_paths(db, start_date, days):
|
||||
""")
|
||||
entry_pages = db.execute(entry_sql, {'start_dt': start_dt}).fetchall()
|
||||
|
||||
# Exit pages - last page in each session
|
||||
# Exit pages - last page in each session (exclude bots)
|
||||
exit_sql = text("""
|
||||
WITH last_pages AS (
|
||||
SELECT DISTINCT ON (session_id) path
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
ORDER BY session_id, viewed_at DESC
|
||||
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
ORDER BY pv.session_id, pv.viewed_at DESC
|
||||
)
|
||||
SELECT path, COUNT(*) as cnt
|
||||
FROM last_pages
|
||||
@ -567,13 +605,14 @@ def _tab_paths(db, start_date, days):
|
||||
max_entry = entry_pages[0].cnt if entry_pages else 1
|
||||
max_exit = exit_pages[0].cnt if exit_pages else 1
|
||||
|
||||
# Top transitions
|
||||
# Top transitions (exclude bots)
|
||||
transitions_sql = text("""
|
||||
WITH ordered AS (
|
||||
SELECT session_id, path,
|
||||
LEAD(path) OVER (PARTITION BY session_id ORDER BY viewed_at) AS next_path
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
SELECT pv.session_id, pv.path,
|
||||
LEAD(pv.path) OVER (PARTITION BY pv.session_id ORDER BY pv.viewed_at) AS next_path
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
)
|
||||
SELECT path, next_path, COUNT(*) as cnt
|
||||
FROM ordered
|
||||
@ -582,21 +621,23 @@ def _tab_paths(db, start_date, days):
|
||||
""")
|
||||
transitions = db.execute(transitions_sql, {'start_dt': start_dt}).fetchall()
|
||||
|
||||
# Drop-off pages (high exit rate)
|
||||
# Drop-off pages (high exit rate, exclude bots)
|
||||
dropoff_sql = text("""
|
||||
WITH page_stats AS (
|
||||
SELECT path, COUNT(*) as total_views
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
GROUP BY path HAVING COUNT(*) >= 5
|
||||
SELECT pv.path, COUNT(*) as total_views
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
GROUP BY pv.path HAVING COUNT(*) >= 5
|
||||
),
|
||||
exit_stats AS (
|
||||
SELECT path, COUNT(*) as exit_count
|
||||
FROM (
|
||||
SELECT DISTINCT ON (session_id) path
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
ORDER BY session_id, viewed_at DESC
|
||||
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
ORDER BY pv.session_id, pv.viewed_at DESC
|
||||
) lp
|
||||
GROUP BY path
|
||||
)
|
||||
@ -609,7 +650,7 @@ def _tab_paths(db, start_date, days):
|
||||
""")
|
||||
dropoff = db.execute(dropoff_sql, {'start_dt': start_dt}).fetchall()
|
||||
|
||||
# Session length distribution
|
||||
# Session length distribution (exclude bots)
|
||||
session_length_sql = text("""
|
||||
SELECT
|
||||
CASE
|
||||
@ -621,10 +662,11 @@ def _tab_paths(db, start_date, days):
|
||||
END as bucket,
|
||||
COUNT(*) as cnt
|
||||
FROM (
|
||||
SELECT session_id, COUNT(*) as pv_count
|
||||
FROM page_views
|
||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
||||
GROUP BY session_id
|
||||
SELECT pv.session_id, COUNT(*) as pv_count
|
||||
FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||
GROUP BY pv.session_id
|
||||
) session_counts
|
||||
GROUP BY bucket
|
||||
ORDER BY MIN(pv_count)
|
||||
@ -651,14 +693,13 @@ def _tab_overview(db, start_date, days):
|
||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
||||
|
||||
# Daily sessions + page views (30d)
|
||||
# Daily sessions from analytics_daily (already bot-filtered after migration)
|
||||
daily_data = db.query(AnalyticsDaily).filter(
|
||||
AnalyticsDaily.date >= date.today() - timedelta(days=30)
|
||||
).order_by(AnalyticsDaily.date).all()
|
||||
|
||||
chart_labels = []
|
||||
chart_sessions = []
|
||||
chart_pageviews = []
|
||||
for d in daily_data:
|
||||
chart_labels.append(d.date.strftime('%d.%m'))
|
||||
if filter_type == 'logged':
|
||||
@ -667,15 +708,36 @@ def _tab_overview(db, start_date, days):
|
||||
chart_sessions.append(d.anonymous_sessions or 0)
|
||||
else:
|
||||
chart_sessions.append(d.total_sessions or 0)
|
||||
chart_pageviews.append(d.total_page_views or 0)
|
||||
|
||||
# Hourly heatmap (7 days x 24 hours)
|
||||
# Daily page views from raw PageView + JOIN (bot-filtered, supports logged/anon filter)
|
||||
pv_filter = [
|
||||
PageView.viewed_at >= start_30d,
|
||||
UserSession.is_bot == False,
|
||||
]
|
||||
if filter_type == 'logged':
|
||||
pv_filter.append(UserSession.user_id.isnot(None))
|
||||
elif filter_type == 'anonymous':
|
||||
pv_filter.append(UserSession.user_id.is_(None))
|
||||
|
||||
pv_daily = db.query(
|
||||
func.date(PageView.viewed_at).label('day'),
|
||||
func.count(PageView.id).label('cnt')
|
||||
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||
*pv_filter
|
||||
).group_by(func.date(PageView.viewed_at)).all()
|
||||
|
||||
pv_by_date = {str(r.day): r.cnt for r in pv_daily}
|
||||
chart_pageviews = []
|
||||
for d in daily_data:
|
||||
chart_pageviews.append(pv_by_date.get(str(d.date), 0))
|
||||
|
||||
# Hourly heatmap (7 days x 24 hours, exclude bots)
|
||||
heatmap_sql = text("""
|
||||
SELECT EXTRACT(DOW FROM started_at)::int as dow,
|
||||
EXTRACT(HOUR FROM started_at)::int as hour,
|
||||
COUNT(*) as cnt
|
||||
FROM user_sessions
|
||||
WHERE started_at >= :start_dt
|
||||
WHERE started_at >= :start_dt AND is_bot = false
|
||||
GROUP BY dow, hour
|
||||
""")
|
||||
heatmap_raw = db.execute(heatmap_sql, {'start_dt': start_30d}).fetchall()
|
||||
@ -697,23 +759,25 @@ def _tab_overview(db, start_date, days):
|
||||
row['hours'].append({'count': cnt, 'intensity': intensity})
|
||||
heatmap_grid.append(row)
|
||||
|
||||
# Logged vs Anonymous
|
||||
# Logged vs Anonymous (exclude bots)
|
||||
total_logged = db.query(func.count(UserSession.id)).filter(
|
||||
UserSession.started_at >= start_30d,
|
||||
UserSession.user_id.isnot(None)
|
||||
UserSession.user_id.isnot(None),
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
total_anon = db.query(func.count(UserSession.id)).filter(
|
||||
UserSession.started_at >= start_30d,
|
||||
UserSession.user_id.is_(None)
|
||||
UserSession.user_id.is_(None),
|
||||
UserSession.is_bot == False
|
||||
).scalar() or 0
|
||||
|
||||
# Devices over time (weekly)
|
||||
# Devices over time (weekly, exclude bots)
|
||||
devices_sql = text("""
|
||||
SELECT DATE_TRUNC('week', started_at)::date as week,
|
||||
device_type,
|
||||
COUNT(*) as cnt
|
||||
FROM user_sessions
|
||||
WHERE started_at >= :start_dt
|
||||
WHERE started_at >= :start_dt AND is_bot = false
|
||||
GROUP BY week, device_type
|
||||
ORDER BY week
|
||||
""")
|
||||
@ -793,13 +857,16 @@ def user_insights_profile(user_id):
|
||||
SearchQuery.user_id == user_id, SearchQuery.searched_at >= start_30d
|
||||
).scalar() or 0
|
||||
|
||||
engagement_score = min(100, int(
|
||||
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2
|
||||
))
|
||||
raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
|
||||
engagement_score = _log_engagement_score(raw)
|
||||
|
||||
# Problem score
|
||||
fl = user.failed_login_attempts or 0
|
||||
# Problem score (failed logins from audit_logs, time-based)
|
||||
fl = db.query(func.count(AuditLog.id)).filter(
|
||||
AuditLog.user_email == user.email,
|
||||
AuditLog.action == 'login_failed',
|
||||
AuditLog.created_at >= start_7d
|
||||
).scalar() or 0
|
||||
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
||||
SecurityAlert.user_email == user.email,
|
||||
SecurityAlert.created_at >= start_7d
|
||||
@ -1166,7 +1233,7 @@ def user_insights_profile(user_id):
|
||||
PageView.viewed_at < d_end
|
||||
).scalar() or 0
|
||||
|
||||
daily_score = min(30, d_sessions * 3 + d_pv)
|
||||
daily_score = _log_engagement_score(d_sessions * 3 + d_pv)
|
||||
trend_labels.append(d.strftime('%d.%m'))
|
||||
trend_scores.append(daily_score)
|
||||
|
||||
|
||||
@ -4144,6 +4144,9 @@ class UserSession(Base):
|
||||
page_views_count = Column(Integer, default=0)
|
||||
clicks_count = Column(Integer, default=0)
|
||||
|
||||
# Bot detection
|
||||
is_bot = Column(Boolean, default=False)
|
||||
|
||||
# UTM Parameters (kampanie marketingowe)
|
||||
utm_source = Column(String(255), nullable=True) # google, facebook, newsletter
|
||||
utm_medium = Column(String(255), nullable=True) # cpc, email, social, organic
|
||||
|
||||
98
database/migrations/079_bot_filtering.sql
Normal file
98
database/migrations/079_bot_filtering.sql
Normal file
@ -0,0 +1,98 @@
|
||||
-- Migration 079: Bot Filtering for Analytics
|
||||
-- Adds is_bot column to user_sessions, backfills from user_agent patterns,
|
||||
-- updates analytics_daily trigger to exclude bots, recalculates 90 days of data.
|
||||
|
||||
-- 1. Add column
|
||||
ALTER TABLE user_sessions ADD COLUMN IF NOT EXISTS is_bot BOOLEAN DEFAULT false;
|
||||
|
||||
-- 2. Backfill from user_agent patterns
|
||||
UPDATE user_sessions SET is_bot = true
|
||||
WHERE user_agent ILIKE '%bot%'
|
||||
OR user_agent ILIKE '%crawler%'
|
||||
OR user_agent ILIKE '%spider%'
|
||||
OR user_agent ILIKE '%curl/%'
|
||||
OR user_agent ILIKE '%python-requests%'
|
||||
OR user_agent ILIKE '%axios/%'
|
||||
OR user_agent ILIKE '%wget/%'
|
||||
OR user_agent ILIKE '%Scrapy%'
|
||||
OR user_agent ILIKE '%Java/%'
|
||||
OR user_agent ILIKE '%Go-http%'
|
||||
OR user_agent ILIKE '%Werkzeug%'
|
||||
OR user_agent ILIKE '%LeakIx%'
|
||||
OR user_agent ILIKE '%Nuclei%'
|
||||
OR user_agent ILIKE '%masscan%'
|
||||
OR user_agent ILIKE '%nmap%'
|
||||
OR user_agent ILIKE '%zgrab%'
|
||||
OR user_agent ILIKE '%httpx%'
|
||||
OR user_agent ILIKE '%censys%'
|
||||
OR user_agent ILIKE '%shodan%'
|
||||
OR user_agent IS NULL;
|
||||
|
||||
-- 3. Partial index for non-bot sessions (most queries filter on this)
|
||||
CREATE INDEX IF NOT EXISTS idx_us_is_bot ON user_sessions(is_bot) WHERE is_bot = false;
|
||||
|
||||
-- 4. Update analytics_daily trigger to skip bot sessions
|
||||
-- Trigger function that keeps analytics_daily in step with rows arriving in
-- user_sessions and page_views. Only sessions explicitly flagged
-- is_bot = false contribute, so the running counters agree with the
-- "is_bot = false" aggregates used below.
CREATE OR REPLACE FUNCTION update_analytics_daily()
RETURNS TRIGGER AS $$
DECLARE target_date DATE;
BEGIN
    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- Skip bots. IS DISTINCT FROM also skips a NULL flag: "= true" would
        -- let a NULL row bump total_sessions while the aggregates ignore it.
        IF NEW.is_bot IS DISTINCT FROM false THEN RETURN NEW; END IF;
        target_date := DATE(NEW.started_at);
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        -- Page views with no session_id, or whose session row cannot be
        -- found, are still counted; only views tied to a session that is
        -- not confirmed non-bot are dropped.
        IF NEW.session_id IS NOT NULL THEN
            IF EXISTS (SELECT 1 FROM user_sessions
                       WHERE id = NEW.session_id AND is_bot IS DISTINCT FROM false) THEN
                RETURN NEW;
            END IF;
        END IF;
        target_date := DATE(NEW.viewed_at);
    ELSE RETURN NEW;
    END IF;

    -- Make sure a row exists for the day before updating it.
    INSERT INTO analytics_daily (date, total_sessions, total_page_views, updated_at)
    VALUES (target_date, 0, 0, NOW()) ON CONFLICT (date) DO NOTHING;

    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- total_sessions is incremented; the breakdown columns are recounted
        -- from user_sessions for the day, always excluding bots.
        UPDATE analytics_daily SET
            total_sessions = total_sessions + 1,
            unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
                WHERE DATE(started_at) = target_date AND user_id IS NOT NULL AND is_bot = false),
            anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND user_id IS NULL AND is_bot = false),
            desktop_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'desktop' AND is_bot = false),
            mobile_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'mobile' AND is_bot = false),
            tablet_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'tablet' AND is_bot = false),
            updated_at = NOW()
        WHERE date = target_date;
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        UPDATE analytics_daily SET total_page_views = total_page_views + 1, updated_at = NOW()
        WHERE date = target_date;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- 5. Recalculate analytics_daily for last 90 days (remove bot contamination)
|
||||
UPDATE analytics_daily ad SET
|
||||
total_sessions = (SELECT COUNT(*) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND is_bot = false),
|
||||
total_page_views = (SELECT COUNT(*) FROM page_views pv
|
||||
JOIN user_sessions us ON pv.session_id = us.id
|
||||
WHERE DATE(pv.viewed_at) = ad.date AND us.is_bot = false),
|
||||
unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND user_id IS NOT NULL AND is_bot = false),
|
||||
anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND user_id IS NULL AND is_bot = false),
|
||||
desktop_sessions = (SELECT COUNT(*) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND device_type = 'desktop' AND is_bot = false),
|
||||
mobile_sessions = (SELECT COUNT(*) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND device_type = 'mobile' AND is_bot = false),
|
||||
tablet_sessions = (SELECT COUNT(*) FROM user_sessions
|
||||
WHERE DATE(started_at) = ad.date AND device_type = 'tablet' AND is_bot = false)
|
||||
WHERE ad.date >= CURRENT_DATE - 90;
|
||||
|
||||
-- 6. Grants
|
||||
GRANT ALL ON TABLE user_sessions TO nordabiz_app;
|
||||
@ -54,12 +54,16 @@ def get_or_create_analytics_session():
|
||||
browser_version = ua.browser.version_string
|
||||
os_name = ua.os.family
|
||||
os_version = ua.os.version_string
|
||||
is_bot = ua.is_bot or any(p in ua_string.lower() for p in
|
||||
['curl/', 'python-requests', 'axios/', 'wget/', 'scrapy',
|
||||
'werkzeug', 'leakix', 'nuclei', 'masscan', 'zgrab', 'httpx'])
|
||||
except Exception:
|
||||
device_type = 'desktop'
|
||||
browser = 'Unknown'
|
||||
browser_version = ''
|
||||
os_name = 'Unknown'
|
||||
os_version = ''
|
||||
is_bot = False
|
||||
|
||||
user_session = UserSession(
|
||||
session_id=analytics_session_id,
|
||||
@ -70,7 +74,8 @@ def get_or_create_analytics_session():
|
||||
browser=browser[:50] if browser else None,
|
||||
browser_version=browser_version[:20] if browser_version else None,
|
||||
os=os_name[:50] if os_name else None,
|
||||
os_version=os_version[:20] if os_version else None
|
||||
os_version=os_version[:20] if os_version else None,
|
||||
is_bot=is_bot
|
||||
)
|
||||
db.add(user_session)
|
||||
db.commit()
|
||||
|
||||
@ -77,6 +77,13 @@ def register_middleware(app):
|
||||
if request.path == '/favicon.ico':
|
||||
return
|
||||
|
||||
# Skip bot/AJAX utility paths
|
||||
skip_exact = {'/robots.txt', '/sitemap.xml', '/manifest.json',
|
||||
'/check-verification-status', '/resend-verification'}
|
||||
skip_prefixes = ('/.well-known/',)
|
||||
if request.path in skip_exact or any(request.path.startswith(p) for p in skip_prefixes):
|
||||
return
|
||||
|
||||
try:
|
||||
from utils.analytics import (
|
||||
track_page_view_for_request,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user