fix(zopk): Użycie operatora % z indeksem GiST dla deduplikacji faktów

Zapytanie similarity() bez indeksu powodowało timeout przy 3414 faktach.
Teraz używamy SET pg_trgm.similarity_threshold oraz operatora %, który
wykorzystuje indeks GiST (idx_facts_fulltext_trgm).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-17 11:09:01 +01:00
parent 96fa0058c2
commit b3249f5b22

View File

@ -1964,27 +1964,33 @@ def find_duplicate_facts(
limit: int = 100,
fact_type: Optional[str] = None
) -> List[Dict]:
"""Find potential duplicate facts using text similarity."""
"""Find potential duplicate facts using text similarity.
Uses pg_trgm % operator with GiST index for fast similarity search.
"""
from sqlalchemy import text
type_filter = f"AND f1.fact_type = '{fact_type}'" if fact_type else ""
# Set similarity threshold and use % operator (uses GiST index)
db_session.execute(text("SET pg_trgm.similarity_threshold = :threshold"), {'threshold': min_similarity})
query = text(f"""
SELECT
f1.id as id1, f1.full_text as text1, f1.fact_type as type1,
f1.is_verified as verified1, f1.importance_score as score1,
f2.id as id2, f2.full_text as text2, f2.fact_type as type2,
f2.is_verified as verified2, f2.importance_score as score2,
similarity(LOWER(f1.full_text), LOWER(f2.full_text)) as sim
similarity(f1.full_text, f2.full_text) as sim
FROM zopk_knowledge_facts f1
JOIN zopk_knowledge_facts f2 ON f1.id < f2.id
WHERE similarity(LOWER(f1.full_text), LOWER(f2.full_text)) >= :min_sim
WHERE f1.full_text % f2.full_text
{type_filter}
ORDER BY sim DESC, GREATEST(f1.importance_score, f2.importance_score) DESC
LIMIT :limit
""")
result = db_session.execute(query, {'min_sim': min_similarity, 'limit': limit})
result = db_session.execute(query, {'limit': limit})
duplicates = []
for row in result: