fix(workflow): extract bulk admissions from decisions, raise pg_trgm threshold to 0.5
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

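Meeting 2 had 5 companies admitted under one agenda item, each with its own decision.
The old code extracted at most one company per proceeding: from the title or, failing
that, from the first admission decision only. Now, when the title yields no name, every
admission decision is parsed for the "Przyjęto jednogłośnie firmę X jako" pattern.
Raised the pg_trgm similarity threshold from 0.3 to 0.5 to avoid false positives
(e.g. "Konkol Sp. z o.o." matching "INPI Sp. z o.o.").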

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-04-06 23:59:03 +02:00
parent 90203676e5
commit 5faa089ce7

View File

@ -167,9 +167,8 @@ def extract_admitted_companies(proceedings: list) -> list:
if isinstance(decisions, str): if isinstance(decisions, str):
decisions = [decisions] decisions = [decisions]
# Check if this is an admission proceeding # Collect all admission decisions from this proceeding
is_admission = False admission_decisions = []
decision_text = ''
for d in decisions: for d in decisions:
d_lower = d.lower() d_lower = d.lower()
if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower
@ -177,22 +176,15 @@ def extract_admitted_companies(proceedings: list) -> list:
and 'program' not in d_lower and 'program' not in d_lower
and 'protokół' not in d_lower and 'protokół' not in d_lower
and 'protokol' not in d_lower): and 'protokol' not in d_lower):
is_admission = True admission_decisions.append(d)
decision_text = d
break
if not is_admission: if not admission_decisions:
continue continue
# Extract company name from title # Try to extract company name from title first
# Pattern 1: "Prezentacja firmy X -- kandydat na czlonka Izby" # Pattern 1: "Prezentacja firmy X — kandydat na członka Izby"
# Pattern 2: "Prezentacja firmy X - kandydat na czlonka Izby" # Pattern 2: "Prezentacja: X coach/mentoring (kandydatka na członka Izby)"
# Pattern 3: "Prezentacja: X -- coach/mentoring (kandydatka na czlonka Izby)" title_name = None
# Pattern 4: "Prezentacja i glosowanie nad kandydatami..." (bulk - extract from decisions)
company_name = None
# Try title patterns
for pattern in [ for pattern in [
r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat', r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat',
r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*', r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*',
@ -200,25 +192,30 @@ def extract_admitted_companies(proceedings: list) -> list:
]: ]:
match = re.search(pattern, title) match = re.search(pattern, title)
if match: if match:
company_name = match.group(1).strip() title_name = match.group(1).strip().rstrip('.')
break break
# If no match from title, try to extract from decision text if title_name:
# Pattern: "Przyjeto jednoglosnie firme X jako nowego czlonka Izby" # Single company from title — use first admission decision
if not company_name:
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', decision_text)
if match:
company_name = match.group(1).strip()
if company_name:
# Clean up: remove trailing dots, Sp. z o.o. standardization
company_name = company_name.rstrip('.')
results.append({ results.append({
'title': title, 'title': title,
'extracted_name': company_name, 'extracted_name': title_name,
'decision_text': decision_text, 'decision_text': admission_decisions[0],
'proceeding_index': i 'proceeding_index': i
}) })
else:
# Bulk admission — extract company names from each decision
# Pattern: "Przyjęto jednogłośnie firmę X jako nowego członka Izby"
for d in admission_decisions:
match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', d)
if match:
company_name = match.group(1).strip().rstrip('.')
results.append({
'title': title,
'extracted_name': company_name,
'decision_text': d,
'proceeding_index': i
})
return results return results
@ -253,7 +250,7 @@ def match_company_by_name(db, name: str) -> tuple:
# 4. pg_trgm similarity (if extension available) # 4. pg_trgm similarity (if extension available)
try: try:
result = db.execute( result = db.execute(
text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.3 ORDER BY sim DESC LIMIT 1"), text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.5 ORDER BY sim DESC LIMIT 1"),
{'name': name} {'name': name}
).first() ).first()
if result: if result: