diff --git a/services/admission_workflow.py b/services/admission_workflow.py index 447ed6f..ab95653 100644 --- a/services/admission_workflow.py +++ b/services/admission_workflow.py @@ -167,9 +167,8 @@ def extract_admitted_companies(proceedings: list) -> list: if isinstance(decisions, str): decisions = [decisions] - # Check if this is an admission proceeding - is_admission = False - decision_text = '' + # Collect all admission decisions from this proceeding + admission_decisions = [] for d in decisions: d_lower = d.lower() if ('przyjęt' in d_lower and 'jednogłośnie' in d_lower @@ -177,22 +176,15 @@ def extract_admitted_companies(proceedings: list) -> list: and 'program' not in d_lower and 'protokół' not in d_lower and 'protokol' not in d_lower): - is_admission = True - decision_text = d - break + admission_decisions.append(d) - if not is_admission: + if not admission_decisions: continue - # Extract company name from title - # Pattern 1: "Prezentacja firmy X -- kandydat na czlonka Izby" - # Pattern 2: "Prezentacja firmy X - kandydat na czlonka Izby" - # Pattern 3: "Prezentacja: X -- coach/mentoring (kandydatka na czlonka Izby)" - # Pattern 4: "Prezentacja i glosowanie nad kandydatami..." (bulk - extract from decisions) - - company_name = None - - # Try title patterns + # Try to extract company name from title first + # Pattern 1: "Prezentacja firmy X — kandydat na członka Izby" + # Pattern 2: "Prezentacja: X – coach/mentoring (kandydatka na członka Izby)" + title_name = None for pattern in [ r'[Pp]rezentacja\s+firmy\s+(.+?)\s*[—–\-]\s*kandydat', r'[Pp]rezentacja:\s+(.+?)\s*[—–\-]\s*', @@ -200,25 +192,30 @@ def extract_admitted_companies(proceedings: list) -> list: ]: match = re.search(pattern, title) if match: - company_name = match.group(1).strip() + title_name = match.group(1).strip().rstrip('.') break - # If no match from title, try to extract from decision text - # Pattern: "Przyjeto jednoglosnie firme X jako nowego czlonka Izby" - if not company_name: - match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', decision_text) - if match: - company_name = match.group(1).strip() - - if company_name: - # Clean up: remove trailing dots, Sp. z o.o. standardization - company_name = company_name.rstrip('.') + if title_name: + # Single company from title — use first admission decision results.append({ 'title': title, - 'extracted_name': company_name, - 'decision_text': decision_text, + 'extracted_name': title_name, + 'decision_text': admission_decisions[0], 'proceeding_index': i }) + else: + # Bulk admission — extract company names from each decision + # Pattern: "Przyjęto jednogłośnie firmę X jako nowego członka Izby" + for d in admission_decisions: + match = re.search(r'[Pp]rzyjęt[oa]\s+jednogłośnie\s+firmę\s+(.+?)\s+jako', d) + if match: + company_name = match.group(1).strip().rstrip('.') + results.append({ + 'title': title, + 'extracted_name': company_name, + 'decision_text': d, + 'proceeding_index': i + }) return results @@ -253,7 +250,7 @@ def match_company_by_name(db, name: str) -> tuple: # 4. pg_trgm similarity (if extension available) try: result = db.execute( - text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.3 ORDER BY sim DESC LIMIT 1"), + text("SELECT id, name, similarity(name, :name) as sim FROM companies WHERE similarity(name, :name) > 0.5 ORDER BY sim DESC LIMIT 1"), {'name': name} ).first() if result: