fix(workflow): extract bulk admissions from decisions, raise pg_trgm threshold to 0.5
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

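Meeting 2 had 5 companies in one agenda item, each with its own separate decision.
The old code extracted at most one company per agenda item (from the title, or, failing
that, from the first admission decision only). Now, when the title yields no company name,
every admission decision is parsed for the "Przyjęto jednogłośnie firmę X jako" pattern.
Raised the pg_trgm similarity threshold from 0.3 to 0.5 to avoid false positives
(e.g. "Konkol Sp. z o.o." matching "INPI Sp. z o.o.").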

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-04-06 23:59:03 +02:00
parent 90203676e5
commit 5faa089ce7

View File

@ -167,9 +167,8 @@ def extract_admitted_companies(proceedings: list) -> list:
if isinstance(decisions, str):
decisions = [decisions]
# Check if this is an admission proceeding
is_admission = False
decision_text = ''
# Collect all admission decisions from this proceeding
admission_decisions = []
for d in decisions:
d_lower = d.lower()
if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower
@ -177,22 +176,15 @@ def extract_admitted_companies(proceedings: list) -> list:
and 'program' not in d_lower
and 'protokół' not in d_lower
and 'protokol' not in d_lower):
is_admission = True
decision_text = d
break
admission_decisions.append(d)
if not is_admission:
if not admission_decisions:
continue
# Extract company name from title
# Pattern 1: "Prezentacja firmy X -- kandydat na czlonka Izby"
# Pattern 2: "Prezentacja firmy X - kandydat na czlonka Izby"
# Pattern 3: "Prezentacja: X -- coach/mentoring (kandydatka na czlonka Izby)"
# Pattern 4: "Prezentacja i glosowanie nad kandydatami..." (bulk - extract from decisions)
company_name = None
# Try title patterns
# Try to extract company name from title first
# Pattern 1: "Prezentacja firmy X — kandydat na członka Izby"
# Pattern 2: "Prezentacja: X coach/mentoring (kandydatka na członka Izby)"
title_name = None
for pattern in [
r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat',
r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*',
@ -200,23 +192,28 @@ def extract_admitted_companies(proceedings: list) -> list:
]:
match = re.search(pattern, title)
if match:
company_name = match.group(1).strip()
title_name = match.group(1).strip().rstrip('.')
break
# If no match from title, try to extract from decision text
# Pattern: "Przyjeto jednoglosnie firme X jako nowego czlonka Izby"
if not company_name:
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', decision_text)
if title_name:
# Single company from title — use first admission decision
results.append({
'title': title,
'extracted_name': title_name,
'decision_text': admission_decisions[0],
'proceeding_index': i
})
else:
# Bulk admission — extract company names from each decision
# Pattern: "Przyjęto jednogłośnie firmę X jako nowego członka Izby"
for d in admission_decisions:
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', d)
if match:
company_name = match.group(1).strip()
if company_name:
# Clean up: remove trailing dots, Sp. z o.o. standardization
company_name = company_name.rstrip('.')
company_name = match.group(1).strip().rstrip('.')
results.append({
'title': title,
'extracted_name': company_name,
'decision_text': decision_text,
'decision_text': d,
'proceeding_index': i
})
@ -253,7 +250,7 @@ def match_company_by_name(db, name: str) -> tuple:
# 4. pg_trgm similarity (if extension available)
try:
result = db.execute(
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.3 ORDER BY sim DESC LIMIT 1"),
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.5 ORDER BY sim DESC LIMIT 1"),
{'name': name}
).first()
if result: