feat(zopk): Implementacja łączenia duplikatów encji (Priorytet 4)

Nowe funkcje w zopk_knowledge_service.py:
- find_duplicate_entities() - wyszukiwanie podobnych encji (pg_trgm)
- merge_entities() - łączenie encji z transferem relacji
- get_entity_merge_preview() - podgląd przed połączeniem

Nowe endpointy w app.py:
- GET /admin/zopk/knowledge/duplicates - panel zarządzania duplikatami
- POST /api/zopk/knowledge/duplicates/preview - podgląd merge
- POST /api/zopk/knowledge/duplicates/merge - wykonanie merge

Nowy szablon:
- templates/admin/zopk_knowledge_duplicates.html - UI z kartami encji

Dodatkowo:
- Aktualizacja CLAUDE.md z procedurą wdrażania
- Skrypt scripts/run_migration.py do uruchamiania migracji SQL

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-17 09:09:22 +01:00
parent 6d1f75bce5
commit 143f5c674a
5 changed files with 1135 additions and 0 deletions

View File

@ -200,6 +200,40 @@ ssh maciejpi@10.22.68.249 "cd /var/www/nordabiznes && sudo -u www-data git pull
- **User:** www-data
- **SSL verify:** disabled (`git -c http.sslVerify=false`)
### Procedura wdrażania (WAŻNE!)
**Pełna procedura wdrażania z migracjami SQL:**
```bash
# 1. DEV: Push do obu repozytoriów
git push origin master && git push inpi master
# 2. PROD: Pull zmiany
ssh maciejpi@10.22.68.249 "cd /var/www/nordabiznes && sudo -u www-data git pull"
# 3. PROD: Uruchom migracje SQL (jeśli są)
ssh maciejpi@10.22.68.249 "cd /var/www/nordabiznes && /var/www/nordabiznes/venv/bin/python3 scripts/run_migration.py database/migrations/XXX_nazwa.sql"
# 4. PROD: Restart serwisu
ssh maciejpi@10.22.68.249 "sudo systemctl restart nordabiznes"
# 5. Weryfikacja
curl -sI https://nordabiznes.pl/health | head -3
```
**⚠️ UWAGI KRYTYCZNE:**
1. **Migracje SQL** - NIE używaj `psql` bezpośrednio (wymaga hasła). Użyj skryptu `scripts/run_migration.py`, który czyta DATABASE_URL z `.env`.
2. **Uprawnienia logów** - Serwis działa jako `maciejpi` (nie `www-data`). Jeśli pojawi się błąd `Permission denied: /var/log/nordabiznes/*`:
```bash
ssh maciejpi@10.22.68.249 "sudo chown -R maciejpi:maciejpi /var/log/nordabiznes/"
```
3. **502 po restarcie** - Czasami występuje chwilowy 502. Poczekaj 3-5 sekund i sprawdź ponownie.
4. **Git pull** - Używaj `sudo -u www-data git pull` (www-data ma dostęp do kluczy SSH).
## Auto Claude - Konfiguracja i rozwiązywanie problemów
### Pliki stanu Auto Claude (WAŻNE!)

96
app.py
View File

@ -11810,6 +11810,102 @@ def api_zopk_chunk_delete(chunk_id):
db.close()
# ============================================================
# ZOPK ENTITY DUPLICATE MANAGEMENT
# ============================================================
@app.route('/admin/zopk/knowledge/duplicates')
@login_required
def admin_zopk_knowledge_duplicates():
    """Render the admin page that lists candidate duplicate entity pairs.

    Query parameters:
        entity_type: optional entity type to restrict the search to; an
            empty or missing value means "all types".
        min_similarity: threshold forwarded to find_duplicate_entities().
            Defaults to 0.4. A value that is not a number, or that falls
            outside 0.0-1.0, falls back to the default instead of failing
            the request.

    Returns:
        The rendered 'admin/zopk_knowledge_duplicates.html' template, or a
        redirect to the dashboard (with a flash message) for non-admins.
    """
    if not current_user.is_admin:
        flash('Brak uprawnień do tej strony.', 'error')
        return redirect(url_for('dashboard'))

    from zopk_knowledge_service import find_duplicate_entities
    from database import ZOPKKnowledgeEntity
    from sqlalchemy import distinct

    default_similarity = 0.4

    entity_type = request.args.get('entity_type', '')
    # type=float makes a malformed value yield the default instead of
    # raising ValueError (which previously surfaced as HTTP 500).
    min_similarity = request.args.get(
        'min_similarity', default=default_similarity, type=float
    )
    # The chained comparison is also False for NaN, so NaN is rejected here.
    if not 0.0 <= min_similarity <= 1.0:
        min_similarity = default_similarity

    db = SessionLocal()
    try:
        duplicates = find_duplicate_entities(
            db,
            entity_type=entity_type if entity_type else None,
            min_similarity=min_similarity,
            limit=100
        )

        # Distinct entity types feed the filter dropdown; empty/NULL values
        # are dropped because they cannot be sorted together with strings.
        entity_types = sorted(
            row[0]
            for row in db.query(distinct(ZOPKKnowledgeEntity.entity_type)).all()
            if row[0]
        )

        return render_template(
            'admin/zopk_knowledge_duplicates.html',
            duplicates=duplicates,
            entity_types=entity_types,
            selected_type=entity_type,
            min_similarity=min_similarity
        )
    finally:
        db.close()
@app.route('/api/zopk/knowledge/duplicates/preview', methods=['POST'])
@login_required
def api_zopk_duplicates_preview():
    """Return a preview of merging two entities without changing anything.

    Expects a JSON object with 'primary_id' (entity to keep) and
    'duplicate_id' (entity to remove).

    Responses (all JSON):
        200: {'success': True, 'preview': ...} with the result of
            get_entity_merge_preview().
        400: missing, non-numeric or identical IDs.
        403: the current user is not an admin.
        404: get_entity_merge_preview() reported an error.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import get_entity_merge_preview

    data = request.get_json() or {}
    # A JSON array or scalar has no .get(); treat it like a missing payload.
    if not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'Brak ID encji'}), 400

    primary_id = data.get('primary_id')
    duplicate_id = data.get('duplicate_id')
    if not primary_id or not duplicate_id:
        return jsonify({'success': False, 'error': 'Brak ID encji'}), 400

    # Coerce up front so garbage IDs become a 400 instead of a database error.
    try:
        primary_id = int(primary_id)
        duplicate_id = int(duplicate_id)
    except (TypeError, ValueError, OverflowError):
        return jsonify({'success': False, 'error': 'Nieprawidłowe ID encji'}), 400

    if primary_id == duplicate_id:
        return jsonify({
            'success': False,
            'error': 'Nie można połączyć encji z samą sobą'
        }), 400

    db = SessionLocal()
    try:
        preview = get_entity_merge_preview(db, primary_id, duplicate_id)
        if 'error' in preview:
            return jsonify({'success': False, 'error': preview['error']}), 404
        return jsonify({'success': True, 'preview': preview})
    finally:
        db.close()
@app.route('/api/zopk/knowledge/duplicates/merge', methods=['POST'])
@login_required
def api_zopk_duplicates_merge():
    """Merge two entities: keep the primary one and delete the duplicate.

    Expects a JSON object with 'primary_id', 'duplicate_id' and an optional
    'new_name' that is forwarded to merge_entities().

    Responses (all JSON):
        200: the dict returned by merge_entities() (its 'success' flag tells
            whether the merge was performed).
        400: missing, non-numeric or identical IDs, or a non-string new_name.
        403: the current user is not an admin.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import merge_entities

    data = request.get_json() or {}
    # A JSON array or scalar has no .get(); treat it like a missing payload.
    if not isinstance(data, dict):
        return jsonify({'success': False, 'error': 'Brak ID encji'}), 400

    primary_id = data.get('primary_id')
    duplicate_id = data.get('duplicate_id')
    if not primary_id or not duplicate_id:
        return jsonify({'success': False, 'error': 'Brak ID encji'}), 400

    # Coerce up front so garbage IDs become a 400 instead of a database error.
    try:
        primary_id = int(primary_id)
        duplicate_id = int(duplicate_id)
    except (TypeError, ValueError, OverflowError):
        return jsonify({'success': False, 'error': 'Nieprawidłowe ID encji'}), 400

    # Merging an entity into itself would end with that entity being deleted,
    # so this must never reach the service layer.
    if primary_id == duplicate_id:
        return jsonify({
            'success': False,
            'error': 'Nie można połączyć encji z samą sobą'
        }), 400

    new_name = data.get('new_name')
    if isinstance(new_name, str):
        # Whitespace-only input means "keep the current name".
        new_name = new_name.strip() or None
    elif new_name is not None:
        return jsonify({'success': False, 'error': 'Nieprawidłowa nazwa encji'}), 400

    db = SessionLocal()
    try:
        result = merge_entities(db, primary_id, duplicate_id, new_name)
        return jsonify(result)
    finally:
        db.close()
# ============================================================
# KRS AUDIT (Krajowy Rejestr Sądowy)
# ============================================================

View File

@ -324,6 +324,13 @@
<div class="quick-link-desc">Firmy, osoby, miejsca, projekty</div>
</div>
</a>
<a href="{{ url_for('admin_zopk_knowledge_duplicates') }}" class="quick-link" style="border-color: #f59e0b;">
<div class="quick-link-icon">🔀</div>
<div class="quick-link-text">
<div class="quick-link-title">Duplikaty</div>
<div class="quick-link-desc">Łączenie podobnych encji</div>
</div>
</a>
<a href="{{ url_for('admin_zopk_news') }}" class="quick-link">
<div class="quick-link-icon">📰</div>
<div class="quick-link-text">

View File

@ -0,0 +1,701 @@
{% extends "base.html" %}
{% block title %}Duplikaty Encji - ZOPK Baza Wiedzy{% endblock %}
{% block extra_css %}
<style>
.page-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: var(--spacing-xl);
}
.page-header h1 {
font-size: var(--font-size-2xl);
color: var(--text-primary);
}
.breadcrumb {
display: flex;
gap: var(--spacing-xs);
color: var(--text-secondary);
font-size: var(--font-size-sm);
margin-bottom: var(--spacing-lg);
}
.breadcrumb a {
color: var(--primary);
text-decoration: none;
}
.breadcrumb a:hover {
text-decoration: underline;
}
.filters-bar {
display: flex;
gap: var(--spacing-md);
align-items: center;
margin-bottom: var(--spacing-xl);
padding: var(--spacing-md);
background: var(--surface);
border-radius: var(--radius);
box-shadow: var(--shadow);
flex-wrap: wrap;
}
.filter-group {
display: flex;
align-items: center;
gap: var(--spacing-xs);
}
.filter-group label {
font-size: var(--font-size-sm);
color: var(--text-secondary);
}
.filter-group select,
.filter-group input {
padding: 6px 12px;
border: 1px solid var(--border);
border-radius: var(--radius-sm);
font-size: var(--font-size-sm);
}
.duplicates-list {
display: flex;
flex-direction: column;
gap: var(--spacing-lg);
}
.duplicate-card {
background: var(--surface);
border-radius: var(--radius-lg);
box-shadow: var(--shadow);
overflow: hidden;
}
.duplicate-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: var(--spacing-md) var(--spacing-lg);
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
border-bottom: 1px solid #fbbf24;
}
.duplicate-type {
display: flex;
align-items: center;
gap: var(--spacing-sm);
font-weight: 600;
color: #92400e;
}
.similarity-badge {
padding: 4px 10px;
border-radius: var(--radius);
font-size: var(--font-size-sm);
font-weight: 600;
}
.similarity-high {
background: #dcfce7;
color: #166534;
}
.similarity-medium {
background: #fef3c7;
color: #92400e;
}
.similarity-low {
background: #fee2e2;
color: #991b1b;
}
.duplicate-body {
display: grid;
grid-template-columns: 1fr auto 1fr;
gap: var(--spacing-lg);
padding: var(--spacing-lg);
}
.entity-card {
padding: var(--spacing-md);
background: var(--background);
border-radius: var(--radius);
border: 2px solid transparent;
cursor: pointer;
transition: var(--transition);
}
.entity-card:hover {
border-color: var(--primary);
}
.entity-card.selected {
border-color: var(--primary);
background: #f0fdf4;
}
.entity-card.selected-duplicate {
border-color: #ef4444;
background: #fee2e2;
}
.entity-name {
font-size: var(--font-size-lg);
font-weight: 600;
color: var(--text-primary);
margin-bottom: var(--spacing-xs);
}
.entity-meta {
display: flex;
gap: var(--spacing-md);
font-size: var(--font-size-sm);
color: var(--text-secondary);
}
.entity-meta span {
display: flex;
align-items: center;
gap: 4px;
}
.merge-arrow {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
gap: var(--spacing-sm);
}
.merge-arrow svg {
width: 32px;
height: 32px;
color: var(--primary);
}
.merge-arrow span {
font-size: var(--font-size-xs);
color: var(--text-muted);
}
.duplicate-actions {
display: flex;
justify-content: flex-end;
gap: var(--spacing-md);
padding: var(--spacing-md) var(--spacing-lg);
border-top: 1px solid var(--border);
background: var(--background);
}
.btn {
display: inline-flex;
align-items: center;
gap: var(--spacing-xs);
padding: 8px 16px;
border-radius: var(--radius);
font-size: var(--font-size-sm);
font-weight: 500;
cursor: pointer;
transition: var(--transition);
text-decoration: none;
border: none;
}
.btn-primary {
background: var(--primary);
color: white;
}
.btn-primary:hover {
background: var(--primary-dark);
}
.btn-danger {
background: #ef4444;
color: white;
}
.btn-danger:hover {
background: #dc2626;
}
.btn-secondary {
background: var(--background);
color: var(--text-primary);
border: 1px solid var(--border);
}
.btn-secondary:hover {
background: var(--surface);
}
.empty-state {
text-align: center;
padding: var(--spacing-2xl);
color: var(--text-secondary);
}
.empty-state svg {
width: 64px;
height: 64px;
color: var(--text-muted);
margin-bottom: var(--spacing-md);
}
/* Modal */
.modal-overlay {
display: none;
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: rgba(0,0,0,0.5);
z-index: 1000;
justify-content: center;
align-items: center;
}
.modal-overlay.active {
display: flex;
}
.modal {
background: var(--surface);
border-radius: var(--radius-lg);
max-width: 600px;
width: 90%;
max-height: 80vh;
overflow-y: auto;
box-shadow: var(--shadow-lg);
}
.modal-header {
padding: var(--spacing-lg);
border-bottom: 1px solid var(--border);
}
.modal-header h2 {
font-size: var(--font-size-xl);
}
.modal-body {
padding: var(--spacing-lg);
}
.modal-footer {
padding: var(--spacing-lg);
border-top: 1px solid var(--border);
display: flex;
justify-content: flex-end;
gap: var(--spacing-md);
}
.preview-section {
margin-bottom: var(--spacing-lg);
}
.preview-section h4 {
font-size: var(--font-size-sm);
color: var(--text-secondary);
margin-bottom: var(--spacing-sm);
}
.preview-entities {
display: grid;
grid-template-columns: 1fr auto 1fr;
gap: var(--spacing-md);
align-items: center;
margin-bottom: var(--spacing-lg);
}
.preview-entity {
padding: var(--spacing-md);
background: var(--background);
border-radius: var(--radius);
}
.preview-entity.keep {
border: 2px solid var(--primary);
}
.preview-entity.delete {
border: 2px solid #ef4444;
opacity: 0.7;
}
.preview-stats {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(120px, 1fr));
gap: var(--spacing-sm);
}
.preview-stat {
padding: var(--spacing-sm);
background: var(--background);
border-radius: var(--radius-sm);
text-align: center;
}
.preview-stat-value {
font-size: var(--font-size-xl);
font-weight: 700;
color: var(--primary);
}
.preview-stat-label {
font-size: var(--font-size-xs);
color: var(--text-secondary);
}
.name-input {
width: 100%;
padding: var(--spacing-sm);
border: 1px solid var(--border);
border-radius: var(--radius);
font-size: var(--font-size-base);
margin-top: var(--spacing-xs);
}
@media (max-width: 768px) {
.duplicate-body {
grid-template-columns: 1fr;
}
.merge-arrow {
transform: rotate(90deg);
}
}
</style>
{% endblock %}
{% block content %}
<div class="container">
<div class="breadcrumb">
<a href="{{ url_for('admin_zopk') }}">Panel ZOPK</a>
<span></span>
<a href="{{ url_for('admin_zopk_knowledge_dashboard') }}">Baza Wiedzy</a>
<span></span>
<span>Duplikaty Encji</span>
</div>
<div class="page-header">
<h1>🔀 Duplikaty Encji</h1>
</div>
<div class="filters-bar">
<form method="get" style="display: contents;">
<div class="filter-group">
<label for="entity_type">Typ encji:</label>
<select name="entity_type" id="entity_type" onchange="this.form.submit()">
<option value="">Wszystkie</option>
{% for etype in entity_types %}
<option value="{{ etype }}" {% if etype == selected_type %}selected{% endif %}>{{ etype }}</option>
{% endfor %}
</select>
</div>
<div class="filter-group">
<label for="min_similarity">Min. podobieństwo:</label>
<input type="range" name="min_similarity" id="min_similarity"
min="0.3" max="0.9" step="0.1"
value="{{ min_similarity }}"
onchange="document.getElementById('sim_value').textContent = this.value; this.form.submit()">
<span id="sim_value">{{ min_similarity }}</span>
</div>
</form>
<div style="margin-left: auto;">
Znaleziono: <strong>{{ duplicates|length }}</strong> par
</div>
</div>
{% if duplicates %}
<div class="duplicates-list">
{% for dup in duplicates %}
<div class="duplicate-card" data-pair-id="{{ loop.index }}">
<div class="duplicate-header">
<div class="duplicate-type">
<span>{{ dup.entity1.entity_type }}</span>
</div>
<span class="similarity-badge {% if dup.similarity > 0.8 %}similarity-high{% elif dup.similarity > 0.6 %}similarity-medium{% else %}similarity-low{% endif %}">
{{ (dup.similarity * 100)|round|int }}% podobieństwo
{% if dup.match_type == 'substring' %}(substring){% endif %}
</span>
</div>
<div class="duplicate-body">
<div class="entity-card"
onclick="selectEntity(this, {{ dup.entity1.id }}, 'primary')"
data-id="{{ dup.entity1.id }}"
data-name="{{ dup.entity1.name }}">
<div class="entity-name">{{ dup.entity1.name }}</div>
<div class="entity-meta">
<span>📊 {{ dup.entity1.mentions_count }} wzmianek</span>
{% if dup.entity1.is_verified %}
<span>✅ Zweryfikowano</span>
{% endif %}
</div>
</div>
<div class="merge-arrow">
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<line x1="5" y1="12" x2="19" y2="12"></line>
<polyline points="12 5 19 12 12 19"></polyline>
</svg>
<span>połącz</span>
</div>
<div class="entity-card"
onclick="selectEntity(this, {{ dup.entity2.id }}, 'duplicate')"
data-id="{{ dup.entity2.id }}"
data-name="{{ dup.entity2.name }}">
<div class="entity-name">{{ dup.entity2.name }}</div>
<div class="entity-meta">
<span>📊 {{ dup.entity2.mentions_count }} wzmianek</span>
{% if dup.entity2.is_verified %}
<span>✅ Zweryfikowano</span>
{% endif %}
</div>
</div>
</div>
<div class="duplicate-actions">
<button class="btn btn-secondary" onclick="skipPair({{ loop.index }})">
⏭️ Pomiń
</button>
<button class="btn btn-primary" onclick="openMergeModal({{ loop.index }}, {{ dup.entity1.id }}, {{ dup.entity2.id }})">
🔀 Połącz encje
</button>
</div>
</div>
{% endfor %}
</div>
{% else %}
<div class="empty-state">
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<circle cx="12" cy="12" r="10"></circle>
<path d="M16 16s-1.5-2-4-2-4 2-4 2"></path>
<line x1="9" y1="9" x2="9.01" y2="9"></line>
<line x1="15" y1="9" x2="15.01" y2="9"></line>
</svg>
<h3>Brak duplikatów do wyświetlenia</h3>
<p>Spróbuj zmniejszyć próg podobieństwa lub wybierz inny typ encji.</p>
</div>
{% endif %}
</div>
<!-- Merge Preview Modal -->
<div class="modal-overlay" id="mergeModal">
<div class="modal">
<div class="modal-header">
<h2>🔀 Podgląd połączenia encji</h2>
</div>
<div class="modal-body" id="mergePreviewContent">
<p>Ładowanie...</p>
</div>
<div class="modal-footer">
<button class="btn btn-secondary" onclick="closeMergeModal()">Anuluj</button>
<button class="btn btn-danger" id="confirmMergeBtn" onclick="confirmMerge()">
🔀 Połącz encje
</button>
</div>
</div>
</div>
{% endblock %}
{% block extra_js %}
// IDs of the pair currently shown in the merge modal; both are reset to null by closeMergeModal().
let currentPrimaryId = null;
let currentDuplicateId = null;
// Name of the primary entity as returned by the preview endpoint; confirmMerge() sends
// new_name only when the text input differs from this value.
let currentNewName = null;
/**
 * Mark the clicked entity card as the one to keep and every other card of
 * the same pair as the duplicate to be removed.
 *
 * Previously only cards wired with role 'primary' could be selected; a click
 * on the other card just cleared the highlighting, so the second entity could
 * never be chosen as the one to keep. The clicked card now always wins.
 *
 * @param {Element} element - the clicked .entity-card element
 * @param {number} id - entity ID (unused; kept for the inline onclick handlers)
 * @param {string} role - default role of the card (unused; kept for the inline onclick handlers)
 */
function selectEntity(element, id, role) {
    const card = element.closest('.duplicate-card');
    card.querySelectorAll('.entity-card').forEach((entity) => {
        const isClicked = entity === element;
        // toggle(name, force) both sets the new state and clears the stale one
        entity.classList.toggle('selected', isClicked);
        entity.classList.toggle('selected-duplicate', !isClicked);
    });
}
/**
 * Visually dismiss a duplicate pair: the card is faded out and stops
 * reacting to pointer input. Nothing is sent to the server.
 *
 * @param {number} pairId - value of the card's data-pair-id attribute
 */
function skipPair(pairId) {
    const { style } = document.querySelector(`[data-pair-id="${pairId}"]`);
    style.opacity = '0.3';
    style.pointerEvents = 'none';
}
/**
 * Open the merge modal for one duplicate pair and load its preview.
 *
 * @param {number} pairId - value of the card's data-pair-id attribute
 * @param {number} id1 - ID of the first entity (default: kept)
 * @param {number} id2 - ID of the second entity (default: removed)
 */
function openMergeModal(pairId, id1, id2) {
    const pairCard = document.querySelector(`[data-pair-id="${pairId}"]`);
    const cards = pairCard.querySelectorAll('.entity-card');

    // Defaults follow the argument order; highlighting on the cards overrides them.
    let keepId = id1;
    let dropId = id2;
    for (const entityCard of cards) {
        const classes = entityCard.classList;
        if (classes.contains('selected')) {
            keepId = parseInt(entityCard.dataset.id);
        }
        if (classes.contains('selected-duplicate')) {
            dropId = parseInt(entityCard.dataset.id);
        }
    }

    // Nothing chosen yet: highlight the first card as kept and the second as removed.
    const nothingSelected = pairCard.querySelector('.selected') === null;
    if (nothingSelected) {
        const [first, second] = cards;
        first.classList.add('selected');
        second.classList.add('selected-duplicate');
    }

    currentPrimaryId = keepId;
    currentDuplicateId = dropId;

    // Reveal the modal first so the loading placeholder is visible while fetching.
    document.getElementById('mergeModal').classList.add('active');
    fetchMergePreview(keepId, dropId);
}
/**
 * Hide the merge modal and forget which pair it was showing.
 */
function closeMergeModal() {
    const overlay = document.getElementById('mergeModal');
    overlay.classList.remove('active');
    currentPrimaryId = currentDuplicateId = null;
}
/**
 * Fetch the merge preview for a pair of entities and render it into the modal.
 *
 * Every server-provided value is HTML-escaped before it is placed into the
 * markup: entity names are arbitrary text, so inserting them raw via
 * innerHTML allowed script injection and broke the value="..." attribute
 * whenever a name contained a double quote.
 *
 * The confirm button is disabled while the preview is loading and stays
 * disabled when loading fails, so a merge cannot be confirmed blind.
 *
 * @param {number} primaryId - ID of the entity to keep
 * @param {number} duplicateId - ID of the entity to remove
 */
async function fetchMergePreview(primaryId, duplicateId) {
    // Escape a value for use in HTML text or inside a double-quoted attribute.
    const esc = (value) => String(value ?? '').replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    // True when the modal was closed or re-opened for another pair while we were waiting.
    const isStale = () => currentPrimaryId !== primaryId || currentDuplicateId !== duplicateId;

    const content = document.getElementById('mergePreviewContent');
    const confirmBtn = document.getElementById('confirmMergeBtn');

    content.innerHTML = '<p>Ładowanie podglądu...</p>';
    if (confirmBtn) {
        confirmBtn.disabled = true;
    }

    try {
        const response = await fetch('/api/zopk/knowledge/duplicates/preview', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'X-CSRFToken': '{{ csrf_token() }}'
            },
            body: JSON.stringify({
                primary_id: primaryId,
                duplicate_id: duplicateId
            })
        });
        const data = await response.json();

        // Do not let a late response overwrite a newer preview.
        if (isStale()) {
            return;
        }

        if (!data.success) {
            content.innerHTML = `<p style="color: #ef4444;">Błąd: ${esc(data.error)}</p>`;
            return;
        }

        const p = data.preview;
        currentNewName = p.primary.name;

        content.innerHTML = `
            <div class="preview-section">
                <h4>Encje do połączenia</h4>
                <div class="preview-entities">
                    <div class="preview-entity keep">
                        <strong>✅ Zachowaj</strong>
                        <div class="entity-name">${esc(p.primary.name)}</div>
                        <div class="entity-meta">
                            <span>📊 ${esc(p.primary.mentions_count)} wzmianek</span>
                        </div>
                    </div>
                    <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <line x1="5" y1="12" x2="19" y2="12"></line>
                        <polyline points="12 5 19 12 12 19"></polyline>
                    </svg>
                    <div class="preview-entity delete">
                        <strong>🗑️ Usuń</strong>
                        <div class="entity-name">${esc(p.duplicate.name)}</div>
                        <div class="entity-meta">
                            <span>📊 ${esc(p.duplicate.mentions_count)} wzmianek</span>
                        </div>
                    </div>
                </div>
            </div>
            <div class="preview-section">
                <h4>Co zostanie przeniesione</h4>
                <div class="preview-stats">
                    <div class="preview-stat">
                        <div class="preview-stat-value">${esc(p.transfers.mentions)}</div>
                        <div class="preview-stat-label">Wzmianki</div>
                    </div>
                    <div class="preview-stat">
                        <div class="preview-stat-value">${esc(p.transfers.facts_subject + p.transfers.facts_object)}</div>
                        <div class="preview-stat-label">Fakty</div>
                    </div>
                    <div class="preview-stat">
                        <div class="preview-stat-value">${esc(p.transfers.relations_source + p.transfers.relations_target)}</div>
                        <div class="preview-stat-label">Relacje</div>
                    </div>
                    <div class="preview-stat">
                        <div class="preview-stat-value">${esc(p.result.new_mentions_count)}</div>
                        <div class="preview-stat-label">Wynik wzmianek</div>
                    </div>
                </div>
            </div>
            <div class="preview-section">
                <h4>Nowa nazwa encji (opcjonalnie)</h4>
                <input type="text" class="name-input" id="newNameInput"
                       value="${esc(p.primary.name)}"
                       placeholder="Pozostaw pustą aby zachować obecną nazwę">
            </div>
        `;

        // Only a successfully rendered preview may be confirmed.
        if (confirmBtn) {
            confirmBtn.disabled = false;
        }
    } catch (error) {
        if (isStale()) {
            return;
        }
        content.innerHTML = `<p style="color: #ef4444;">Błąd połączenia: ${esc(error.message)}</p>`;
    }
}
/**
 * Send the merge request for the pair currently shown in the modal.
 *
 * On success the user gets a summary alert, the modal is closed and the page
 * is reloaded. On failure the error is shown and the button becomes usable
 * again. new_name is sent only when the input differs from the name the
 * preview returned; otherwise null is sent.
 */
async function confirmMerge() {
    const confirmButton = document.getElementById('confirmMergeBtn');
    const restoreButton = () => {
        confirmButton.disabled = false;
        confirmButton.textContent = '🔀 Połącz encje';
    };

    // Lock the button for the duration of the request to prevent double submits.
    confirmButton.disabled = true;
    confirmButton.textContent = 'Łączenie...';

    const typedName = document.getElementById('newNameInput')?.value || null;

    try {
        const payload = {
            primary_id: currentPrimaryId,
            duplicate_id: currentDuplicateId,
            new_name: typedName !== currentNewName ? typedName : null
        };
        const response = await fetch('/api/zopk/knowledge/duplicates/merge', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'X-CSRFToken': '{{ csrf_token() }}'
            },
            body: JSON.stringify(payload)
        });
        const result = await response.json();

        if (!result.success) {
            alert(`❌ Błąd: ${result.error}`);
            restoreButton();
            return;
        }

        alert(`✅ Encje połączone!\n\nPrzeniesiono:\n- ${result.transfers.mentions} wzmianek\n- ${result.transfers.facts_subject + result.transfers.facts_object} faktów\n- ${result.transfers.relations_source + result.transfers.relations_target} relacji`);
        closeMergeModal();
        window.location.reload();
    } catch (error) {
        alert(`❌ Błąd połączenia: ${error.message}`);
        restoreButton();
    }
}
{% endblock %}

View File

@ -1653,3 +1653,300 @@ def delete_chunk(db_session, chunk_id: int) -> bool:
db_session.delete(chunk)
db_session.commit()
return True
# ============================================================
# DUPLICATE ENTITY DETECTION AND MERGING
# ============================================================
def find_duplicate_entities(
    db_session,
    entity_type: Optional[str] = None,
    min_similarity: float = 0.5,
    limit: int = 100
) -> List[Dict]:
    """
    Find potential duplicate entities using fuzzy matching.

    Uses the PostgreSQL pg_trgm similarity() function on lower-cased names.
    Two entities of the same type are reported as a pair when their
    similarity exceeds ``min_similarity`` OR when one name is contained in
    the other (substring matches are returned regardless of the threshold).
    Each pair is reported once (the entity with the lower ID is 'entity1').

    Args:
        db_session: SQLAlchemy session
        entity_type: Filter by entity type (company, person, etc.);
            None or empty means all types
        min_similarity: Minimum similarity threshold (0.0-1.0)
        limit: Maximum number of pairs to return

    Returns:
        List of dicts with duplicate pairs:
        [
            {
                'entity1': {...},
                'entity2': {...},
                'similarity': 0.85,
                'match_type': 'fuzzy'  # or 'substring'
            }
        ]
    """
    from sqlalchemy import text

    params = {'min_sim': min_similarity, 'limit': limit}

    # entity_type comes from the request query string, so it is passed as a
    # bound parameter. Only this constant fragment is spliced into the SQL.
    type_filter = ""
    if entity_type:
        type_filter = "AND e1.entity_type = :entity_type"
        params['entity_type'] = entity_type

    query = text(f"""
        SELECT
            e1.id as id1, e1.name as name1, e1.entity_type as type1,
            e1.mentions_count as mentions1, e1.is_verified as verified1,
            e2.id as id2, e2.name as name2, e2.entity_type as type2,
            e2.mentions_count as mentions2, e2.is_verified as verified2,
            similarity(LOWER(e1.name), LOWER(e2.name)) as sim,
            CASE
                WHEN LOWER(e1.name) LIKE '%' || LOWER(e2.name) || '%'
                     OR LOWER(e2.name) LIKE '%' || LOWER(e1.name) || '%'
                THEN 'substring'
                ELSE 'fuzzy'
            END as match_type
        FROM zopk_knowledge_entities e1
        JOIN zopk_knowledge_entities e2
            ON e1.id < e2.id
            AND e1.entity_type = e2.entity_type
        WHERE (
            similarity(LOWER(e1.name), LOWER(e2.name)) > :min_sim
            OR LOWER(e1.name) LIKE '%' || LOWER(e2.name) || '%'
            OR LOWER(e2.name) LIKE '%' || LOWER(e1.name) || '%'
        )
        {type_filter}
        ORDER BY
            e1.entity_type,
            GREATEST(e1.mentions_count, e2.mentions_count) DESC,
            sim DESC
        LIMIT :limit
    """)

    rows = db_session.execute(query, params)

    return [
        {
            'entity1': {
                'id': row.id1,
                'name': row.name1,
                'entity_type': row.type1,
                'mentions_count': row.mentions1,
                'is_verified': row.verified1
            },
            'entity2': {
                'id': row.id2,
                'name': row.name2,
                'entity_type': row.type2,
                'mentions_count': row.mentions2,
                'is_verified': row.verified2
            },
            'similarity': float(row.sim) if row.sim else 0.0,
            'match_type': row.match_type
        }
        for row in rows
    ]
def merge_entities(
    db_session,
    primary_id: int,
    duplicate_id: int,
    new_name: Optional[str] = None
) -> Dict:
    """
    Merge two entities - keep primary, delete duplicate.

    Re-points every row that references the duplicate to the primary:
    - Entity mentions
    - Facts (subject/object references)
    - Relations (source/target)
    Then adds the duplicate's mentions_count to the primary, records the
    duplicate's name and aliases as aliases of the primary, deletes the
    duplicate and commits. Any exception rolls the whole merge back.

    Args:
        db_session: SQLAlchemy session
        primary_id: ID of entity to keep
        duplicate_id: ID of entity to merge and delete
        new_name: Optional new canonical name for primary

    Returns:
        Dict with merge results. On success:
        {
            'success': True,
            'primary_id': 123,
            'deleted_id': 456,
            'new_mentions_count': 20,
            'transfers': {
                'mentions': 15,
                'facts_subject': 3,
                'facts_object': 2,
                'relations_source': 1,
                'relations_target': 0
            }
        }
        On failure: {'success': False, 'error': '<message>'}.
    """
    from sqlalchemy import text

    # Get both entities
    primary = db_session.query(ZOPKKnowledgeEntity).get(primary_id)
    duplicate = db_session.query(ZOPKKnowledgeEntity).get(duplicate_id)

    if not primary:
        return {'success': False, 'error': f'Primary entity {primary_id} not found'}
    if not duplicate:
        return {'success': False, 'error': f'Duplicate entity {duplicate_id} not found'}
    # Merging an entity into itself would finish by deleting that very
    # entity, taking all of its data with it.
    if primary.id == duplicate.id:
        return {'success': False, 'error': 'Cannot merge an entity with itself'}
    if primary.entity_type != duplicate.entity_type:
        return {'success': False, 'error': 'Cannot merge entities of different types'}

    # (result key, table, foreign-key column) for every reference to re-point.
    # These are fixed identifiers, never user input, so formatting them into
    # the statement below is safe; the IDs themselves are bound parameters.
    transfer_targets = (
        ('mentions', 'zopk_knowledge_entity_mentions', 'entity_id'),
        ('facts_subject', 'zopk_knowledge_facts', 'subject_entity_id'),
        ('facts_object', 'zopk_knowledge_facts', 'object_entity_id'),
        ('relations_source', 'zopk_knowledge_relations', 'source_entity_id'),
        ('relations_target', 'zopk_knowledge_relations', 'target_entity_id'),
    )
    transfers = {key: 0 for key, _table, _column in transfer_targets}
    id_params = {'primary_id': primary_id, 'duplicate_id': duplicate_id}

    try:
        # 1-5. Transfer mentions, facts and relations
        for key, table, column in transfer_targets:
            result = db_session.execute(text(f"""
                UPDATE {table}
                SET {column} = :primary_id
                WHERE {column} = :duplicate_id
            """), id_params)
            transfers[key] = result.rowcount

        # 6. Update primary entity (a NULL counter is treated as 0)
        new_mentions_count = (primary.mentions_count or 0) + (duplicate.mentions_count or 0)
        primary.mentions_count = new_mentions_count

        if new_name:
            primary.canonical_name = new_name

        # Merge aliases. The duplicate's own name is always kept as an alias,
        # even when the duplicate had no aliases of its own. A NEW list is
        # assigned rather than mutating the existing one in place, so the
        # change is picked up even if the column type does not track
        # in-place mutations.
        current_aliases = list(primary.aliases or [])
        merged_aliases = list(current_aliases)
        for alias in [duplicate.name, *(duplicate.aliases or [])]:
            if alias and alias != primary.name and alias not in merged_aliases:
                merged_aliases.append(alias)
        if merged_aliases != current_aliases:
            primary.aliases = merged_aliases

        # 7. Delete duplicate
        db_session.delete(duplicate)
        db_session.commit()

        return {
            'success': True,
            'primary_id': primary_id,
            'deleted_id': duplicate_id,
            'new_mentions_count': new_mentions_count,
            'transfers': transfers
        }
    except Exception as e:
        db_session.rollback()
        logger.error(f"Error merging entities: {e}")
        return {'success': False, 'error': str(e)}
def get_entity_merge_preview(
    db_session,
    primary_id: int,
    duplicate_id: int
) -> Dict:
    """
    Preview what would happen if two entities are merged.

    Read-only: counts the rows that currently reference the duplicate and
    would be re-pointed to the primary by a merge.

    Args:
        db_session: SQLAlchemy session
        primary_id: ID of entity that would be kept
        duplicate_id: ID of entity that would be merged and deleted

    Returns:
        {'error': 'Entity not found'} when either entity is missing,
        otherwise a dict with:
        - 'primary' / 'duplicate': id, name, entity_type, mentions_count
          and aliases of each entity
        - 'transfers': per-reference counts plus their 'total'
        - 'result': {'new_mentions_count': combined mentions counter}
    """
    from sqlalchemy import func

    primary = db_session.query(ZOPKKnowledgeEntity).get(primary_id)
    duplicate = db_session.query(ZOPKKnowledgeEntity).get(duplicate_id)

    if not primary or not duplicate:
        return {'error': 'Entity not found'}

    def count_references(id_column, fk_column) -> int:
        """Count rows whose foreign key points at the duplicate."""
        return db_session.query(func.count(id_column)).filter(
            fk_column == duplicate_id
        ).scalar() or 0

    def describe(entity) -> Dict:
        """Snapshot of the entity fields returned in the preview."""
        return {
            'id': entity.id,
            'name': entity.name,
            'entity_type': entity.entity_type,
            'mentions_count': entity.mentions_count,
            'aliases': entity.aliases or []
        }

    # Count items that would be transferred
    transfers = {
        'mentions': count_references(
            ZOPKKnowledgeEntityMention.id, ZOPKKnowledgeEntityMention.entity_id
        ),
        'facts_subject': count_references(
            ZOPKKnowledgeFact.id, ZOPKKnowledgeFact.subject_entity_id
        ),
        'facts_object': count_references(
            ZOPKKnowledgeFact.id, ZOPKKnowledgeFact.object_entity_id
        ),
        'relations_source': count_references(
            ZOPKKnowledgeRelation.id, ZOPKKnowledgeRelation.source_entity_id
        ),
        'relations_target': count_references(
            ZOPKKnowledgeRelation.id, ZOPKKnowledgeRelation.target_entity_id
        ),
    }
    # 'total' is computed before it is added so it does not count itself.
    transfers['total'] = sum(transfers.values())

    return {
        'primary': describe(primary),
        'duplicate': describe(duplicate),
        'transfers': transfers,
        'result': {
            # A NULL counter is treated as 0 instead of raising TypeError.
            'new_mentions_count': (primary.mentions_count or 0) + (duplicate.mentions_count or 0)
        }
    }