fix(zopk): Użycie operatora % z indeksem GiST dla deduplikacji faktów
Zapytanie similarity() bez indeksu powodowało timeout przy 3414 faktach. Teraz używamy SET pg_trgm.similarity_threshold + operator %, który wykorzystuje indeks GiST (idx_facts_fulltext_trgm).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
96fa0058c2
commit
b3249f5b22
@@ -1964,27 +1964,33 @@ def find_duplicate_facts(
|
||||
limit: int = 100,
|
||||
fact_type: Optional[str] = None
|
||||
) -> List[Dict]:
|
||||
"""Find potential duplicate facts using text similarity."""
|
||||
"""Find potential duplicate facts using text similarity.
|
||||
|
||||
Uses pg_trgm % operator with GiST index for fast similarity search.
|
||||
"""
|
||||
from sqlalchemy import text
|
||||
|
||||
type_filter = f"AND f1.fact_type = '{fact_type}'" if fact_type else ""
|
||||
|
||||
# Set similarity threshold and use % operator (uses GiST index)
|
||||
db_session.execute(text("SET pg_trgm.similarity_threshold = :threshold"), {'threshold': min_similarity})
|
||||
|
||||
query = text(f"""
|
||||
SELECT
|
||||
f1.id as id1, f1.full_text as text1, f1.fact_type as type1,
|
||||
f1.is_verified as verified1, f1.importance_score as score1,
|
||||
f2.id as id2, f2.full_text as text2, f2.fact_type as type2,
|
||||
f2.is_verified as verified2, f2.importance_score as score2,
|
||||
similarity(LOWER(f1.full_text), LOWER(f2.full_text)) as sim
|
||||
similarity(f1.full_text, f2.full_text) as sim
|
||||
FROM zopk_knowledge_facts f1
|
||||
JOIN zopk_knowledge_facts f2 ON f1.id < f2.id
|
||||
WHERE similarity(LOWER(f1.full_text), LOWER(f2.full_text)) >= :min_sim
|
||||
WHERE f1.full_text % f2.full_text
|
||||
{type_filter}
|
||||
ORDER BY sim DESC, GREATEST(f1.importance_score, f2.importance_score) DESC
|
||||
LIMIT :limit
|
||||
""")
|
||||
|
||||
result = db_session.execute(query, {'min_sim': min_similarity, 'limit': limit})
|
||||
result = db_session.execute(query, {'limit': limit})
|
||||
|
||||
duplicates = []
|
||||
for row in result:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user