fix: filter bots from analytics, use audit_logs for failed logins, logarithmic engagement score
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add is_bot column to user_sessions with backfill from user_agent patterns
- Update analytics_daily trigger to skip bot sessions
- Recalculate 90 days of analytics_daily without bot contamination
- Replace cumulative failed_login_attempts with time-based audit_logs queries
- Switch engagement score from linear (capped at 100) to log2 scale
- Expand section_map from 9 to 17 categories (~95% traffic coverage)
- Exclude robots.txt, sitemap.xml, etc. from page view tracking
- Add bot filter to all overview, pages, paths, and engagement queries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
fc9d979fea
commit
cca52301a6
@ -9,6 +9,7 @@ page popularity, user flows, and behavioral profiles.
|
|||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
from datetime import date, timedelta, datetime
|
from datetime import date, timedelta, datetime
|
||||||
|
|
||||||
from flask import render_template, request, redirect, url_for, flash, Response
|
from flask import render_template, request, redirect, url_for, flash, Response
|
||||||
@ -27,6 +28,21 @@ from utils.decorators import role_required
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _non_bot_sessions(db, start_dt=None):
    """Build a query selecting the IDs of sessions not flagged as bots.

    Intended to be passed to ``.in_(...)`` when filtering page views by
    ``session_id``.

    Args:
        db: Database session exposing ``.query()``.
        start_dt: Optional lower bound. When given, only sessions whose
            ``started_at`` is at or after this value are included.

    Returns:
        An unexecuted query over ``UserSession.id``.

    NOTE(review): the bound is applied to the session *start*. A page view
    inside the reporting window that belongs to a session started before
    ``start_dt`` will therefore not match this subquery -- confirm that
    callers combining it with a ``viewed_at`` filter accept this.
    """
    # `== False` is deliberate: the ORM overloads it into a SQL comparison,
    # so it must not be rewritten as `is False` / `not ...`.
    session_ids = db.query(UserSession.id).filter(
        UserSession.is_bot == False  # noqa: E712
    )
    # Explicit None check: the bound is optional, not "falsy means absent".
    if start_dt is not None:
        session_ids = session_ids.filter(UserSession.started_at >= start_dt)
    return session_ids
|
||||||
|
|
||||||
|
|
||||||
|
def _log_engagement_score(raw):
|
||||||
|
"""Logarithmic engagement score: better distribution than linear capped at 100."""
|
||||||
|
if raw <= 0:
|
||||||
|
return 0
|
||||||
|
return min(100, int(math.log2(raw + 1) * 6))
|
||||||
|
|
||||||
|
|
||||||
def _get_period_dates(period):
|
def _get_period_dates(period):
|
||||||
"""Return (start_date, days) for given period string."""
|
"""Return (start_date, days) for given period string."""
|
||||||
today = date.today()
|
today = date.today()
|
||||||
@ -95,11 +111,9 @@ def _tab_problems(db, start_date, days):
|
|||||||
User.locked_until > now, User.is_active == True
|
User.locked_until > now, User.is_active == True
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
failed_logins_7d = db.query(
|
failed_logins_7d = db.query(func.count(AuditLog.id)).filter(
|
||||||
func.coalesce(func.sum(User.failed_login_attempts), 0)
|
AuditLog.action == 'login_failed',
|
||||||
).filter(
|
AuditLog.created_at >= start_dt
|
||||||
User.is_active == True,
|
|
||||||
User.failed_login_attempts > 0
|
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
password_resets_7d = db.query(func.count(EmailLog.id)).filter(
|
password_resets_7d = db.query(func.count(EmailLog.id)).filter(
|
||||||
@ -116,8 +130,12 @@ def _tab_problems(db, start_date, days):
|
|||||||
problem_users = []
|
problem_users = []
|
||||||
|
|
||||||
for user in users:
|
for user in users:
|
||||||
# Failed logins
|
# Failed logins (from audit_logs, time-based)
|
||||||
fl = user.failed_login_attempts or 0
|
fl = db.query(func.count(AuditLog.id)).filter(
|
||||||
|
AuditLog.user_email == user.email,
|
||||||
|
AuditLog.action == 'login_failed',
|
||||||
|
AuditLog.created_at >= start_dt
|
||||||
|
).scalar() or 0
|
||||||
|
|
||||||
# Security alerts 7d
|
# Security alerts 7d
|
||||||
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
||||||
@ -292,7 +310,8 @@ def _tab_engagement(db, start_date, days):
|
|||||||
# Stat cards
|
# Stat cards
|
||||||
active_7d = db.query(func.count(func.distinct(UserSession.user_id))).filter(
|
active_7d = db.query(func.count(func.distinct(UserSession.user_id))).filter(
|
||||||
UserSession.user_id.isnot(None),
|
UserSession.user_id.isnot(None),
|
||||||
UserSession.started_at >= start_dt
|
UserSession.started_at >= start_dt,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
all_users = db.query(User).filter(User.is_active == True).all()
|
all_users = db.query(User).filter(User.is_active == True).all()
|
||||||
@ -321,49 +340,57 @@ def _tab_engagement(db, start_date, days):
|
|||||||
|
|
||||||
engagement_list = []
|
engagement_list = []
|
||||||
for user in registered_users:
|
for user in registered_users:
|
||||||
# Current period
|
# Current period (exclude bots)
|
||||||
sessions_cur = db.query(func.count(UserSession.id)).filter(
|
sessions_cur = db.query(func.count(UserSession.id)).filter(
|
||||||
UserSession.user_id == user.id,
|
UserSession.user_id == user.id,
|
||||||
UserSession.started_at >= start_dt
|
UserSession.started_at >= start_dt,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
pv_cur = db.query(func.count(PageView.id)).filter(
|
pv_cur = db.query(func.count(PageView.id)).filter(
|
||||||
PageView.user_id == user.id,
|
PageView.user_id == user.id,
|
||||||
PageView.viewed_at >= start_dt
|
PageView.viewed_at >= start_dt,
|
||||||
|
PageView.session_id.in_(_non_bot_sessions(db, start_dt))
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
# Previous period for WoW
|
# Previous period for WoW
|
||||||
sessions_prev = db.query(func.count(UserSession.id)).filter(
|
sessions_prev = db.query(func.count(UserSession.id)).filter(
|
||||||
UserSession.user_id == user.id,
|
UserSession.user_id == user.id,
|
||||||
UserSession.started_at >= prev_start,
|
UserSession.started_at >= prev_start,
|
||||||
UserSession.started_at < start_dt
|
UserSession.started_at < start_dt,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
pv_prev = db.query(func.count(PageView.id)).filter(
|
pv_prev = db.query(func.count(PageView.id)).filter(
|
||||||
PageView.user_id == user.id,
|
PageView.user_id == user.id,
|
||||||
PageView.viewed_at >= prev_start,
|
PageView.viewed_at >= prev_start,
|
||||||
PageView.viewed_at < start_dt
|
PageView.viewed_at < start_dt,
|
||||||
|
PageView.session_id.in_(_non_bot_sessions(db, prev_start))
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
# 30d engagement score components
|
# 30d engagement score components (exclude bots)
|
||||||
s30 = db.query(func.count(UserSession.id)).filter(
|
s30 = db.query(func.count(UserSession.id)).filter(
|
||||||
UserSession.user_id == user.id,
|
UserSession.user_id == user.id,
|
||||||
UserSession.started_at >= start_30d
|
UserSession.started_at >= start_30d,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
pv30 = db.query(func.count(PageView.id)).filter(
|
pv30 = db.query(func.count(PageView.id)).filter(
|
||||||
PageView.user_id == user.id,
|
PageView.user_id == user.id,
|
||||||
PageView.viewed_at >= start_30d
|
PageView.viewed_at >= start_30d,
|
||||||
|
PageView.session_id.in_(_non_bot_sessions(db, start_30d))
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
clicks30 = db.query(func.sum(UserSession.clicks_count)).filter(
|
clicks30 = db.query(func.sum(UserSession.clicks_count)).filter(
|
||||||
UserSession.user_id == user.id,
|
UserSession.user_id == user.id,
|
||||||
UserSession.started_at >= start_30d
|
UserSession.started_at >= start_30d,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
dur30 = db.query(func.sum(UserSession.duration_seconds)).filter(
|
dur30 = db.query(func.sum(UserSession.duration_seconds)).filter(
|
||||||
UserSession.user_id == user.id,
|
UserSession.user_id == user.id,
|
||||||
UserSession.started_at >= start_30d
|
UserSession.started_at >= start_30d,
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
conv30 = db.query(func.count(ConversionEvent.id)).filter(
|
conv30 = db.query(func.count(ConversionEvent.id)).filter(
|
||||||
@ -376,11 +403,9 @@ def _tab_engagement(db, start_date, days):
|
|||||||
SearchQuery.searched_at >= start_30d
|
SearchQuery.searched_at >= start_30d
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
score = min(100,
|
raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||||
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
|
||||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2
|
score = _log_engagement_score(raw)
|
||||||
)
|
|
||||||
score = int(score)
|
|
||||||
|
|
||||||
# WoW change
|
# WoW change
|
||||||
wow = None
|
wow = None
|
||||||
@ -444,16 +469,24 @@ def _tab_pages(db, start_date, days):
|
|||||||
"""Page popularity map."""
|
"""Page popularity map."""
|
||||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||||
|
|
||||||
# Page sections with grouping
|
# Page sections with grouping (expanded to cover ~95% of traffic)
|
||||||
section_map = {
|
section_map = {
|
||||||
'Strona główna': ['/'],
|
'Strona główna': ['/'],
|
||||||
'Profile firm': ['/company/'],
|
'Profile firm': ['/company/'],
|
||||||
'Forum': ['/forum'],
|
'Forum': ['/forum'],
|
||||||
'Chat': ['/chat'],
|
'Chat': ['/chat'],
|
||||||
'Wyszukiwarka': ['/search', '/szukaj'],
|
'Wyszukiwarka': ['/search', '/szukaj'],
|
||||||
'Wydarzenia': ['/events', '/wydarzenia'],
|
'Wydarzenia': ['/events', '/wydarzenia', '/kalendarz'],
|
||||||
'Ogłoszenia': ['/classifieds', '/ogloszenia'],
|
'Ogłoszenia': ['/classifieds', '/ogloszenia', '/tablica'],
|
||||||
'Członkostwo': ['/membership', '/czlonkostwo'],
|
'Członkostwo': ['/membership', '/czlonkostwo', '/korzysci'],
|
||||||
|
'Logowanie': ['/login', '/register', '/forgot-password', '/reset-password', '/verify-email'],
|
||||||
|
'Panel użytkownika': ['/dashboard', '/konto'],
|
||||||
|
'Wiadomości': ['/wiadomosci'],
|
||||||
|
'Edukacja': ['/edukacja'],
|
||||||
|
'Rada': ['/rada'],
|
||||||
|
'ZOPK': ['/zopk'],
|
||||||
|
'Kontakty': ['/kontakty'],
|
||||||
|
'Raporty': ['/raporty'],
|
||||||
'Admin': ['/admin'],
|
'Admin': ['/admin'],
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -467,9 +500,10 @@ def _tab_pages(db, start_date, days):
|
|||||||
func.count(PageView.id).label('views'),
|
func.count(PageView.id).label('views'),
|
||||||
func.count(func.distinct(PageView.user_id)).label('unique_users'),
|
func.count(func.distinct(PageView.user_id)).label('unique_users'),
|
||||||
func.avg(PageView.time_on_page_seconds).label('avg_time')
|
func.avg(PageView.time_on_page_seconds).label('avg_time')
|
||||||
).filter(
|
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||||
or_(*conditions),
|
or_(*conditions),
|
||||||
PageView.viewed_at >= start_dt
|
PageView.viewed_at >= start_dt,
|
||||||
|
UserSession.is_bot == False
|
||||||
).first()
|
).first()
|
||||||
|
|
||||||
sections.append({
|
sections.append({
|
||||||
@ -484,7 +518,7 @@ def _tab_pages(db, start_date, days):
|
|||||||
for s in sections:
|
for s in sections:
|
||||||
s['intensity'] = min(100, int(s['views'] / max_views * 100))
|
s['intensity'] = min(100, int(s['views'] / max_views * 100))
|
||||||
|
|
||||||
# Top 50 pages
|
# Top 50 pages (exclude bots)
|
||||||
top_pages = db.query(
|
top_pages = db.query(
|
||||||
PageView.path,
|
PageView.path,
|
||||||
func.count(PageView.id).label('views'),
|
func.count(PageView.id).label('views'),
|
||||||
@ -492,8 +526,9 @@ def _tab_pages(db, start_date, days):
|
|||||||
func.avg(PageView.time_on_page_seconds).label('avg_time'),
|
func.avg(PageView.time_on_page_seconds).label('avg_time'),
|
||||||
func.avg(PageView.scroll_depth_percent).label('avg_scroll'),
|
func.avg(PageView.scroll_depth_percent).label('avg_scroll'),
|
||||||
func.avg(PageView.load_time_ms).label('avg_load'),
|
func.avg(PageView.load_time_ms).label('avg_load'),
|
||||||
).filter(
|
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||||
PageView.viewed_at >= start_dt
|
PageView.viewed_at >= start_dt,
|
||||||
|
UserSession.is_bot == False
|
||||||
).group_by(PageView.path).order_by(desc('views')).limit(50).all()
|
).group_by(PageView.path).order_by(desc('views')).limit(50).all()
|
||||||
|
|
||||||
max_page_views = top_pages[0].views if top_pages else 1
|
max_page_views = top_pages[0].views if top_pages else 1
|
||||||
@ -510,13 +545,14 @@ def _tab_pages(db, start_date, days):
|
|||||||
'bar_pct': int(p.views / max_page_views * 100),
|
'bar_pct': int(p.views / max_page_views * 100),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Ignored pages (< 5 views in 30d)
|
# Ignored pages (< 5 views in 30d, exclude bots)
|
||||||
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
||||||
ignored = db.query(
|
ignored = db.query(
|
||||||
PageView.path,
|
PageView.path,
|
||||||
func.count(PageView.id).label('views'),
|
func.count(PageView.id).label('views'),
|
||||||
).filter(
|
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||||
PageView.viewed_at >= start_30d
|
PageView.viewed_at >= start_30d,
|
||||||
|
UserSession.is_bot == False
|
||||||
).group_by(PageView.path).having(
|
).group_by(PageView.path).having(
|
||||||
func.count(PageView.id) < 5
|
func.count(PageView.id) < 5
|
||||||
).order_by('views').limit(30).all()
|
).order_by('views').limit(30).all()
|
||||||
@ -536,13 +572,14 @@ def _tab_paths(db, start_date, days):
|
|||||||
"""User flow analysis."""
|
"""User flow analysis."""
|
||||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||||
|
|
||||||
# Entry pages - first page in each session
|
# Entry pages - first page in each session (exclude bots)
|
||||||
entry_sql = text("""
|
entry_sql = text("""
|
||||||
WITH first_pages AS (
|
WITH first_pages AS (
|
||||||
SELECT DISTINCT ON (session_id) path
|
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
ORDER BY session_id, viewed_at ASC
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
|
ORDER BY pv.session_id, pv.viewed_at ASC
|
||||||
)
|
)
|
||||||
SELECT path, COUNT(*) as cnt
|
SELECT path, COUNT(*) as cnt
|
||||||
FROM first_pages
|
FROM first_pages
|
||||||
@ -550,13 +587,14 @@ def _tab_paths(db, start_date, days):
|
|||||||
""")
|
""")
|
||||||
entry_pages = db.execute(entry_sql, {'start_dt': start_dt}).fetchall()
|
entry_pages = db.execute(entry_sql, {'start_dt': start_dt}).fetchall()
|
||||||
|
|
||||||
# Exit pages - last page in each session
|
# Exit pages - last page in each session (exclude bots)
|
||||||
exit_sql = text("""
|
exit_sql = text("""
|
||||||
WITH last_pages AS (
|
WITH last_pages AS (
|
||||||
SELECT DISTINCT ON (session_id) path
|
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
ORDER BY session_id, viewed_at DESC
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
|
ORDER BY pv.session_id, pv.viewed_at DESC
|
||||||
)
|
)
|
||||||
SELECT path, COUNT(*) as cnt
|
SELECT path, COUNT(*) as cnt
|
||||||
FROM last_pages
|
FROM last_pages
|
||||||
@ -567,13 +605,14 @@ def _tab_paths(db, start_date, days):
|
|||||||
max_entry = entry_pages[0].cnt if entry_pages else 1
|
max_entry = entry_pages[0].cnt if entry_pages else 1
|
||||||
max_exit = exit_pages[0].cnt if exit_pages else 1
|
max_exit = exit_pages[0].cnt if exit_pages else 1
|
||||||
|
|
||||||
# Top transitions
|
# Top transitions (exclude bots)
|
||||||
transitions_sql = text("""
|
transitions_sql = text("""
|
||||||
WITH ordered AS (
|
WITH ordered AS (
|
||||||
SELECT session_id, path,
|
SELECT pv.session_id, pv.path,
|
||||||
LEAD(path) OVER (PARTITION BY session_id ORDER BY viewed_at) AS next_path
|
LEAD(pv.path) OVER (PARTITION BY pv.session_id ORDER BY pv.viewed_at) AS next_path
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
)
|
)
|
||||||
SELECT path, next_path, COUNT(*) as cnt
|
SELECT path, next_path, COUNT(*) as cnt
|
||||||
FROM ordered
|
FROM ordered
|
||||||
@ -582,21 +621,23 @@ def _tab_paths(db, start_date, days):
|
|||||||
""")
|
""")
|
||||||
transitions = db.execute(transitions_sql, {'start_dt': start_dt}).fetchall()
|
transitions = db.execute(transitions_sql, {'start_dt': start_dt}).fetchall()
|
||||||
|
|
||||||
# Drop-off pages (high exit rate)
|
# Drop-off pages (high exit rate, exclude bots)
|
||||||
dropoff_sql = text("""
|
dropoff_sql = text("""
|
||||||
WITH page_stats AS (
|
WITH page_stats AS (
|
||||||
SELECT path, COUNT(*) as total_views
|
SELECT pv.path, COUNT(*) as total_views
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
GROUP BY path HAVING COUNT(*) >= 5
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
|
GROUP BY pv.path HAVING COUNT(*) >= 5
|
||||||
),
|
),
|
||||||
exit_stats AS (
|
exit_stats AS (
|
||||||
SELECT path, COUNT(*) as exit_count
|
SELECT path, COUNT(*) as exit_count
|
||||||
FROM (
|
FROM (
|
||||||
SELECT DISTINCT ON (session_id) path
|
SELECT DISTINCT ON (pv.session_id) pv.path
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
ORDER BY session_id, viewed_at DESC
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
|
ORDER BY pv.session_id, pv.viewed_at DESC
|
||||||
) lp
|
) lp
|
||||||
GROUP BY path
|
GROUP BY path
|
||||||
)
|
)
|
||||||
@ -609,7 +650,7 @@ def _tab_paths(db, start_date, days):
|
|||||||
""")
|
""")
|
||||||
dropoff = db.execute(dropoff_sql, {'start_dt': start_dt}).fetchall()
|
dropoff = db.execute(dropoff_sql, {'start_dt': start_dt}).fetchall()
|
||||||
|
|
||||||
# Session length distribution
|
# Session length distribution (exclude bots)
|
||||||
session_length_sql = text("""
|
session_length_sql = text("""
|
||||||
SELECT
|
SELECT
|
||||||
CASE
|
CASE
|
||||||
@ -621,10 +662,11 @@ def _tab_paths(db, start_date, days):
|
|||||||
END as bucket,
|
END as bucket,
|
||||||
COUNT(*) as cnt
|
COUNT(*) as cnt
|
||||||
FROM (
|
FROM (
|
||||||
SELECT session_id, COUNT(*) as pv_count
|
SELECT pv.session_id, COUNT(*) as pv_count
|
||||||
FROM page_views
|
FROM page_views pv
|
||||||
WHERE viewed_at >= :start_dt AND session_id IS NOT NULL
|
JOIN user_sessions us ON pv.session_id = us.id
|
||||||
GROUP BY session_id
|
WHERE pv.viewed_at >= :start_dt AND pv.session_id IS NOT NULL AND us.is_bot = false
|
||||||
|
GROUP BY pv.session_id
|
||||||
) session_counts
|
) session_counts
|
||||||
GROUP BY bucket
|
GROUP BY bucket
|
||||||
ORDER BY MIN(pv_count)
|
ORDER BY MIN(pv_count)
|
||||||
@ -651,14 +693,13 @@ def _tab_overview(db, start_date, days):
|
|||||||
start_dt = datetime.combine(start_date, datetime.min.time())
|
start_dt = datetime.combine(start_date, datetime.min.time())
|
||||||
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
start_30d = datetime.combine(date.today() - timedelta(days=30), datetime.min.time())
|
||||||
|
|
||||||
# Daily sessions + page views (30d)
|
# Daily sessions from analytics_daily (already bot-filtered after migration)
|
||||||
daily_data = db.query(AnalyticsDaily).filter(
|
daily_data = db.query(AnalyticsDaily).filter(
|
||||||
AnalyticsDaily.date >= date.today() - timedelta(days=30)
|
AnalyticsDaily.date >= date.today() - timedelta(days=30)
|
||||||
).order_by(AnalyticsDaily.date).all()
|
).order_by(AnalyticsDaily.date).all()
|
||||||
|
|
||||||
chart_labels = []
|
chart_labels = []
|
||||||
chart_sessions = []
|
chart_sessions = []
|
||||||
chart_pageviews = []
|
|
||||||
for d in daily_data:
|
for d in daily_data:
|
||||||
chart_labels.append(d.date.strftime('%d.%m'))
|
chart_labels.append(d.date.strftime('%d.%m'))
|
||||||
if filter_type == 'logged':
|
if filter_type == 'logged':
|
||||||
@ -667,15 +708,36 @@ def _tab_overview(db, start_date, days):
|
|||||||
chart_sessions.append(d.anonymous_sessions or 0)
|
chart_sessions.append(d.anonymous_sessions or 0)
|
||||||
else:
|
else:
|
||||||
chart_sessions.append(d.total_sessions or 0)
|
chart_sessions.append(d.total_sessions or 0)
|
||||||
chart_pageviews.append(d.total_page_views or 0)
|
|
||||||
|
|
||||||
# Hourly heatmap (7 days x 24 hours)
|
# Daily page views from raw PageView + JOIN (bot-filtered, supports logged/anon filter)
|
||||||
|
pv_filter = [
|
||||||
|
PageView.viewed_at >= start_30d,
|
||||||
|
UserSession.is_bot == False,
|
||||||
|
]
|
||||||
|
if filter_type == 'logged':
|
||||||
|
pv_filter.append(UserSession.user_id.isnot(None))
|
||||||
|
elif filter_type == 'anonymous':
|
||||||
|
pv_filter.append(UserSession.user_id.is_(None))
|
||||||
|
|
||||||
|
pv_daily = db.query(
|
||||||
|
func.date(PageView.viewed_at).label('day'),
|
||||||
|
func.count(PageView.id).label('cnt')
|
||||||
|
).join(UserSession, PageView.session_id == UserSession.id).filter(
|
||||||
|
*pv_filter
|
||||||
|
).group_by(func.date(PageView.viewed_at)).all()
|
||||||
|
|
||||||
|
pv_by_date = {str(r.day): r.cnt for r in pv_daily}
|
||||||
|
chart_pageviews = []
|
||||||
|
for d in daily_data:
|
||||||
|
chart_pageviews.append(pv_by_date.get(str(d.date), 0))
|
||||||
|
|
||||||
|
# Hourly heatmap (7 days x 24 hours, exclude bots)
|
||||||
heatmap_sql = text("""
|
heatmap_sql = text("""
|
||||||
SELECT EXTRACT(DOW FROM started_at)::int as dow,
|
SELECT EXTRACT(DOW FROM started_at)::int as dow,
|
||||||
EXTRACT(HOUR FROM started_at)::int as hour,
|
EXTRACT(HOUR FROM started_at)::int as hour,
|
||||||
COUNT(*) as cnt
|
COUNT(*) as cnt
|
||||||
FROM user_sessions
|
FROM user_sessions
|
||||||
WHERE started_at >= :start_dt
|
WHERE started_at >= :start_dt AND is_bot = false
|
||||||
GROUP BY dow, hour
|
GROUP BY dow, hour
|
||||||
""")
|
""")
|
||||||
heatmap_raw = db.execute(heatmap_sql, {'start_dt': start_30d}).fetchall()
|
heatmap_raw = db.execute(heatmap_sql, {'start_dt': start_30d}).fetchall()
|
||||||
@ -697,23 +759,25 @@ def _tab_overview(db, start_date, days):
|
|||||||
row['hours'].append({'count': cnt, 'intensity': intensity})
|
row['hours'].append({'count': cnt, 'intensity': intensity})
|
||||||
heatmap_grid.append(row)
|
heatmap_grid.append(row)
|
||||||
|
|
||||||
# Logged vs Anonymous
|
# Logged vs Anonymous (exclude bots)
|
||||||
total_logged = db.query(func.count(UserSession.id)).filter(
|
total_logged = db.query(func.count(UserSession.id)).filter(
|
||||||
UserSession.started_at >= start_30d,
|
UserSession.started_at >= start_30d,
|
||||||
UserSession.user_id.isnot(None)
|
UserSession.user_id.isnot(None),
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
total_anon = db.query(func.count(UserSession.id)).filter(
|
total_anon = db.query(func.count(UserSession.id)).filter(
|
||||||
UserSession.started_at >= start_30d,
|
UserSession.started_at >= start_30d,
|
||||||
UserSession.user_id.is_(None)
|
UserSession.user_id.is_(None),
|
||||||
|
UserSession.is_bot == False
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
# Devices over time (weekly)
|
# Devices over time (weekly, exclude bots)
|
||||||
devices_sql = text("""
|
devices_sql = text("""
|
||||||
SELECT DATE_TRUNC('week', started_at)::date as week,
|
SELECT DATE_TRUNC('week', started_at)::date as week,
|
||||||
device_type,
|
device_type,
|
||||||
COUNT(*) as cnt
|
COUNT(*) as cnt
|
||||||
FROM user_sessions
|
FROM user_sessions
|
||||||
WHERE started_at >= :start_dt
|
WHERE started_at >= :start_dt AND is_bot = false
|
||||||
GROUP BY week, device_type
|
GROUP BY week, device_type
|
||||||
ORDER BY week
|
ORDER BY week
|
||||||
""")
|
""")
|
||||||
@ -793,13 +857,16 @@ def user_insights_profile(user_id):
|
|||||||
SearchQuery.user_id == user_id, SearchQuery.searched_at >= start_30d
|
SearchQuery.user_id == user_id, SearchQuery.searched_at >= start_30d
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
engagement_score = min(100, int(
|
raw = (s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
||||||
s30 * 3 + pv30 * 1 + int(clicks30) * 0.5 +
|
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2)
|
||||||
int(dur30) / 60 * 2 + conv30 * 10 + search30 * 2
|
engagement_score = _log_engagement_score(raw)
|
||||||
))
|
|
||||||
|
|
||||||
# Problem score
|
# Problem score (failed logins from audit_logs, time-based)
|
||||||
fl = user.failed_login_attempts or 0
|
fl = db.query(func.count(AuditLog.id)).filter(
|
||||||
|
AuditLog.user_email == user.email,
|
||||||
|
AuditLog.action == 'login_failed',
|
||||||
|
AuditLog.created_at >= start_7d
|
||||||
|
).scalar() or 0
|
||||||
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
sa_7d = db.query(func.count(SecurityAlert.id)).filter(
|
||||||
SecurityAlert.user_email == user.email,
|
SecurityAlert.user_email == user.email,
|
||||||
SecurityAlert.created_at >= start_7d
|
SecurityAlert.created_at >= start_7d
|
||||||
@ -1166,7 +1233,7 @@ def user_insights_profile(user_id):
|
|||||||
PageView.viewed_at < d_end
|
PageView.viewed_at < d_end
|
||||||
).scalar() or 0
|
).scalar() or 0
|
||||||
|
|
||||||
daily_score = min(30, d_sessions * 3 + d_pv)
|
daily_score = _log_engagement_score(d_sessions * 3 + d_pv)
|
||||||
trend_labels.append(d.strftime('%d.%m'))
|
trend_labels.append(d.strftime('%d.%m'))
|
||||||
trend_scores.append(daily_score)
|
trend_scores.append(daily_score)
|
||||||
|
|
||||||
|
|||||||
@ -4144,6 +4144,9 @@ class UserSession(Base):
|
|||||||
page_views_count = Column(Integer, default=0)
|
page_views_count = Column(Integer, default=0)
|
||||||
clicks_count = Column(Integer, default=0)
|
clicks_count = Column(Integer, default=0)
|
||||||
|
|
||||||
|
# Bot detection
|
||||||
|
is_bot = Column(Boolean, default=False)
|
||||||
|
|
||||||
# UTM Parameters (kampanie marketingowe)
|
# UTM Parameters (kampanie marketingowe)
|
||||||
utm_source = Column(String(255), nullable=True) # google, facebook, newsletter
|
utm_source = Column(String(255), nullable=True) # google, facebook, newsletter
|
||||||
utm_medium = Column(String(255), nullable=True) # cpc, email, social, organic
|
utm_medium = Column(String(255), nullable=True) # cpc, email, social, organic
|
||||||
|
|||||||
98
database/migrations/079_bot_filtering.sql
Normal file
98
database/migrations/079_bot_filtering.sql
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
-- Migration 079: Bot Filtering for Analytics
-- Adds is_bot column to user_sessions, backfills from user_agent patterns,
-- updates analytics_daily trigger to exclude bots, recalculates 90 days of data.

-- 1. Add column
-- The column is nullable; DEFAULT false applies when an INSERT omits it.
-- IF NOT EXISTS makes this statement safe to re-run.
ALTER TABLE user_sessions ADD COLUMN IF NOT EXISTS is_bot BOOLEAN DEFAULT false;

-- 2. Backfill from user_agent patterns
-- ILIKE is a case-insensitive substring match, so '%bot%' flags any user
-- agent that merely contains "bot". Rows with no user agent at all are also
-- treated as bots (last condition).
-- NOTE(review): this only ever sets is_bot = true; rows flagged by an earlier
-- run are never reset if the pattern list is later narrowed.
UPDATE user_sessions SET is_bot = true
WHERE user_agent ILIKE '%bot%'
   OR user_agent ILIKE '%crawler%'
   OR user_agent ILIKE '%spider%'
   OR user_agent ILIKE '%curl/%'
   OR user_agent ILIKE '%python-requests%'
   OR user_agent ILIKE '%axios/%'
   OR user_agent ILIKE '%wget/%'
   OR user_agent ILIKE '%Scrapy%'
   OR user_agent ILIKE '%Java/%'
   OR user_agent ILIKE '%Go-http%'
   OR user_agent ILIKE '%Werkzeug%'
   OR user_agent ILIKE '%LeakIx%'
   OR user_agent ILIKE '%Nuclei%'
   OR user_agent ILIKE '%masscan%'
   OR user_agent ILIKE '%nmap%'
   OR user_agent ILIKE '%zgrab%'
   OR user_agent ILIKE '%httpx%'
   OR user_agent ILIKE '%censys%'
   OR user_agent ILIKE '%shodan%'
   OR user_agent IS NULL;

-- 3. Partial index for non-bot sessions (most queries filter on this)
-- Only rows with is_bot = false are included in the index.
CREATE INDEX IF NOT EXISTS idx_us_is_bot ON user_sessions(is_bot) WHERE is_bot = false;
|
||||||
|
|
||||||
|
-- 4. Update analytics_daily trigger to skip bot sessions
-- Trigger function shared by user_sessions and page_views. It keeps the
-- per-day row in analytics_daily up to date and ignores rows that belong to
-- bot sessions. It always returns NEW, so the triggering write itself is
-- never altered or blocked.
CREATE OR REPLACE FUNCTION update_analytics_daily()
RETURNS TRIGGER AS $$
DECLARE target_date DATE;
BEGIN
    -- Decide which day to update, or bail out early for bot traffic.
    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- Bot sessions never touch analytics_daily.
        -- NOTE(review): a NULL is_bot passes this guard (NULL = true is not
        -- true) but is excluded by the "is_bot = false" counts below --
        -- confirm NULL cannot occur in practice.
        IF NEW.is_bot = true THEN RETURN NEW; END IF;
        target_date := DATE(NEW.started_at);
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        -- Page views are skipped only when their session is flagged as a bot;
        -- page views without a session_id are still counted.
        IF NEW.session_id IS NOT NULL THEN
            IF EXISTS (SELECT 1 FROM user_sessions WHERE id = NEW.session_id AND is_bot = true) THEN
                RETURN NEW;
            END IF;
        END IF;
        target_date := DATE(NEW.viewed_at);
    ELSE RETURN NEW;  -- fired from any other table: nothing to do
    END IF;

    -- Make sure a row for the day exists before updating it.
    INSERT INTO analytics_daily (date, total_sessions, total_page_views, updated_at)
    VALUES (target_date, 0, 0, NOW()) ON CONFLICT (date) DO NOTHING;

    IF TG_TABLE_NAME = 'user_sessions' THEN
        -- total_sessions is incremented, while the breakdown columns are
        -- recomputed from user_sessions for that day (non-bot rows only).
        UPDATE analytics_daily SET
            total_sessions = total_sessions + 1,
            unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
                WHERE DATE(started_at) = target_date AND user_id IS NOT NULL AND is_bot = false),
            anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND user_id IS NULL AND is_bot = false),
            desktop_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'desktop' AND is_bot = false),
            mobile_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'mobile' AND is_bot = false),
            tablet_sessions = (SELECT COUNT(*) FROM user_sessions
                WHERE DATE(started_at) = target_date AND device_type = 'tablet' AND is_bot = false),
            updated_at = NOW()
        WHERE date = target_date;
    ELSIF TG_TABLE_NAME = 'page_views' THEN
        UPDATE analytics_daily SET total_page_views = total_page_views + 1, updated_at = NOW()
        WHERE date = target_date;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
-- 5. Recalculate analytics_daily for last 90 days (remove bot contamination)
-- Every counter is rebuilt from the raw tables using non-bot sessions only,
-- so this must run after the is_bot backfill in step 2.
-- NOTE(review): total_page_views uses an inner JOIN, so page views with a
-- NULL session_id are dropped here, whereas the trigger in step 4 still
-- counts them -- confirm which behaviour is intended.
UPDATE analytics_daily ad SET
    total_sessions = (SELECT COUNT(*) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND is_bot = false),
    total_page_views = (SELECT COUNT(*) FROM page_views pv
        JOIN user_sessions us ON pv.session_id = us.id
        WHERE DATE(pv.viewed_at) = ad.date AND us.is_bot = false),
    unique_users = (SELECT COUNT(DISTINCT user_id) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND user_id IS NOT NULL AND is_bot = false),
    anonymous_sessions = (SELECT COUNT(*) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND user_id IS NULL AND is_bot = false),
    desktop_sessions = (SELECT COUNT(*) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND device_type = 'desktop' AND is_bot = false),
    mobile_sessions = (SELECT COUNT(*) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND device_type = 'mobile' AND is_bot = false),
    tablet_sessions = (SELECT COUNT(*) FROM user_sessions
        WHERE DATE(started_at) = ad.date AND device_type = 'tablet' AND is_bot = false)
WHERE ad.date >= CURRENT_DATE - 90;

-- 6. Grants
-- Grants the application role nordabiz_app all privileges on user_sessions.
GRANT ALL ON TABLE user_sessions TO nordabiz_app;
|
||||||
@ -54,12 +54,16 @@ def get_or_create_analytics_session():
|
|||||||
browser_version = ua.browser.version_string
|
browser_version = ua.browser.version_string
|
||||||
os_name = ua.os.family
|
os_name = ua.os.family
|
||||||
os_version = ua.os.version_string
|
os_version = ua.os.version_string
|
||||||
|
is_bot = ua.is_bot or any(p in ua_string.lower() for p in
|
||||||
|
['curl/', 'python-requests', 'axios/', 'wget/', 'scrapy',
|
||||||
|
'werkzeug', 'leakix', 'nuclei', 'masscan', 'zgrab', 'httpx'])
|
||||||
except Exception:
|
except Exception:
|
||||||
device_type = 'desktop'
|
device_type = 'desktop'
|
||||||
browser = 'Unknown'
|
browser = 'Unknown'
|
||||||
browser_version = ''
|
browser_version = ''
|
||||||
os_name = 'Unknown'
|
os_name = 'Unknown'
|
||||||
os_version = ''
|
os_version = ''
|
||||||
|
is_bot = False
|
||||||
|
|
||||||
user_session = UserSession(
|
user_session = UserSession(
|
||||||
session_id=analytics_session_id,
|
session_id=analytics_session_id,
|
||||||
@ -70,7 +74,8 @@ def get_or_create_analytics_session():
|
|||||||
browser=browser[:50] if browser else None,
|
browser=browser[:50] if browser else None,
|
||||||
browser_version=browser_version[:20] if browser_version else None,
|
browser_version=browser_version[:20] if browser_version else None,
|
||||||
os=os_name[:50] if os_name else None,
|
os=os_name[:50] if os_name else None,
|
||||||
os_version=os_version[:20] if os_version else None
|
os_version=os_version[:20] if os_version else None,
|
||||||
|
is_bot=is_bot
|
||||||
)
|
)
|
||||||
db.add(user_session)
|
db.add(user_session)
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|||||||
@ -77,6 +77,13 @@ def register_middleware(app):
|
|||||||
if request.path == '/favicon.ico':
|
if request.path == '/favicon.ico':
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Skip bot/AJAX utility paths
|
||||||
|
skip_exact = {'/robots.txt', '/sitemap.xml', '/manifest.json',
|
||||||
|
'/check-verification-status', '/resend-verification'}
|
||||||
|
skip_prefixes = ('/.well-known/',)
|
||||||
|
if request.path in skip_exact or any(request.path.startswith(p) for p in skip_prefixes):
|
||||||
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from utils.analytics import (
|
from utils.analytics import (
|
||||||
track_page_view_for_request,
|
track_page_view_for_request,
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user