feat: Add web search to AI enrichment

- Search Brave API for company news and web results
- Fetch company website content with BeautifulSoup
- Combine all sources (database, Brave, website) in AI prompt
- Return source usage info in API response
This commit is contained in:
Maciej Pienczyn 2026-01-13 18:36:52 +01:00
parent fdbe2d461b
commit cf31842287

222
app.py
View File

@ -5861,12 +5861,143 @@ def api_refresh_company_krs(company_id):
db.close()
def _search_brave_for_company(company_name: str, city: str = None) -> dict:
    """
    Search the Brave Search API for news and web results about a company.

    The news and web endpoints are queried independently, so a failure of
    one request does not discard the results of the other. This is a
    best-effort lookup: every error is logged and none is raised.

    Args:
        company_name: Company name; searched as an exact (quoted) phrase.
        city: Optional city appended to the query to narrow the results.

    Returns:
        Dict with two keys:
          'news': list of dicts with 'title', 'description', 'url', 'source'
          'web':  list of dicts with 'title', 'description', 'url'
        Each list holds at most 5 entries and is empty when BRAVE_API_KEY
        is not configured or the corresponding request fails. All string
        fields are normalised to '' when missing or null in the response.
    """
    import requests

    results = {'news': [], 'web': []}

    brave_api_key = os.getenv('BRAVE_API_KEY')
    if not brave_api_key:
        logger.warning("BRAVE_API_KEY not configured, skipping web search")
        return results

    # Build search query: quoted name for an exact phrase match
    query = f'"{company_name}"'
    if city:
        query += f' {city}'

    headers = {
        'Accept': 'application/json',
        'X-Subscription-Token': brave_api_key,
    }

    # Search news (own try block so a news failure cannot skip the web search)
    try:
        news_response = requests.get(
            'https://api.search.brave.com/res/v1/news/search',
            headers=headers,
            params={
                'q': query,
                'count': 5,
                'freshness': 'py',  # past year
                'country': 'pl',
                'search_lang': 'pl',
            },
            timeout=10,
        )
        if news_response.status_code == 200:
            news_data = news_response.json() or {}
            for item in (news_data.get('results') or [])[:5]:
                # 'meta_url' may be present but null; .get(key, {}) would
                # return None in that case, so fall back explicitly.
                meta_url = item.get('meta_url') or {}
                results['news'].append({
                    'title': item.get('title') or '',
                    'description': item.get('description') or '',
                    'url': item.get('url') or '',
                    'source': meta_url.get('hostname') or '',
                })
            logger.info(f"Brave News: found {len(results['news'])} items for '{company_name}'")
        else:
            # Surface auth / rate-limit problems instead of silently
            # returning an empty list.
            logger.warning(
                f"Brave News: HTTP {news_response.status_code} for '{company_name}'"
            )
    except Exception as e:
        # Deliberately broad: enrichment must continue without web data.
        logger.error(f"Brave search error for '{company_name}' (news): {e}")

    # Search web
    try:
        web_response = requests.get(
            'https://api.search.brave.com/res/v1/web/search',
            headers=headers,
            params={
                'q': query,
                'count': 5,
                'country': 'pl',
                'search_lang': 'pl',
            },
            timeout=10,
        )
        if web_response.status_code == 200:
            web_data = web_response.json() or {}
            web_section = web_data.get('web') or {}
            for item in (web_section.get('results') or [])[:5]:
                results['web'].append({
                    'title': item.get('title') or '',
                    'description': item.get('description') or '',
                    'url': item.get('url') or '',
                })
            logger.info(f"Brave Web: found {len(results['web'])} items for '{company_name}'")
        else:
            logger.warning(
                f"Brave Web: HTTP {web_response.status_code} for '{company_name}'"
            )
    except Exception as e:
        # Deliberately broad: enrichment must continue without web data.
        logger.error(f"Brave search error for '{company_name}' (web): {e}")

    return results
def _fetch_website_content(url: str) -> str:
    """
    Fetch a company website and extract its visible text content.

    Script, style, nav, footer and header elements are removed before the
    text is extracted, and runs of whitespace are collapsed to single
    spaces. This is a best-effort fetch: failures are logged and result in
    an empty string rather than an exception.

    Args:
        url: Website address; 'https://' is prepended when no http(s)
            scheme is present.

    Returns:
        Up to 3000 characters of extracted text, or '' when the URL is
        empty, the server does not answer with HTTP 200, or the request
        or parsing fails.
    """
    import requests
    from bs4 import BeautifulSoup

    if not url:
        return ''

    try:
        url = url.strip()
        if not url:
            return ''

        # Ensure URL has protocol. Match the full scheme so that a host
        # such as 'httpfoo.pl' is not mistaken for an absolute URL.
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        # NOTE(review): the URL presumably comes from stored company data;
        # if users can edit it, consider restricting requests to public
        # hosts to avoid server-side request forgery.
        response = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; NordaBizBot/1.0)'
        })

        if response.status_code != 200:
            logger.warning(
                f"Failed to fetch website content from {url}: HTTP {response.status_code}"
            )
            return ''

        # When the server declares no charset, requests decodes text/* as
        # ISO-8859-1, which garbles Polish diacritics on UTF-8 pages. In
        # that case hand the raw bytes to BeautifulSoup so it can detect
        # the encoding from the document itself (e.g. <meta charset>).
        content_type = response.headers.get('Content-Type', '')
        if 'charset' in content_type.lower():
            markup = response.text
        else:
            markup = response.content

        soup = BeautifulSoup(markup, 'html.parser')

        # Remove scripts, styles and page chrome that carry no company info
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()

        # Get text content and collapse all whitespace runs
        text = soup.get_text(separator=' ', strip=True)
        text = ' '.join(text.split())

        logger.info(f"Fetched {len(text)} chars from {url}")
        return text[:3000]  # Limit to 3000 chars
    except Exception as e:
        # Deliberately broad: a broken website must not fail enrichment.
        logger.warning(f"Failed to fetch website content from {url}: {e}")

    return ''
@app.route('/api/company/<int:company_id>/enrich-ai', methods=['POST'])
@login_required
@limiter.limit("5 per hour")
def api_enrich_company_ai(company_id):
"""
API: Enrich company data using AI (Gemini).
API: Enrich company data using AI (Gemini) with web search.
Process:
1. Search Brave API for company news and web results
2. Fetch content from company website
3. Combine with existing database data
4. Send to Gemini for AI-powered enrichment
Generates AI insights including:
- Business summary
@ -5907,15 +6038,45 @@ def api_enrich_company_ai(company_id):
'error': 'Usluga AI jest niedostepna. Skontaktuj sie z administratorem.'
}), 503
# Collect existing company data for context
# Get services from relationship or services_offered field
logger.info(f"AI enrichment triggered by {current_user.email} for company: {company.name} (ID: {company.id})")
# ============================================
# STEP 1: Search the web for company info
# ============================================
brave_results = _search_brave_for_company(company.name, company.address_city)
# Format news for prompt
news_text = ""
if brave_results['news']:
news_text = "\n".join([
f"- {item['title']}: {item['description'][:200]}"
for item in brave_results['news'][:3]
])
# Format web results for prompt
web_text = ""
if brave_results['web']:
web_text = "\n".join([
f"- {item['title']}: {item['description'][:200]}"
for item in brave_results['web'][:3]
])
# ============================================
# STEP 2: Fetch company website content
# ============================================
website_content = ""
if company.website:
website_content = _fetch_website_content(company.website)
# ============================================
# STEP 3: Collect existing company data
# ============================================
services_list = []
if company.services:
services_list = [cs.service.name for cs in company.services if cs.service]
elif company.services_offered:
services_list = [company.services_offered]
# Get competencies from relationship
competencies_list = []
if company.competencies:
competencies_list = [cc.competency.name for cc in company.competencies if cc.competency]
@ -5927,48 +6088,57 @@ def api_enrich_company_ai(company_id):
'kategoria': company.category.name if company.category else '',
'uslugi': ', '.join(services_list) if services_list else '',
'kompetencje': ', '.join(competencies_list) if competencies_list else '',
'slowa_kluczowe': '', # keywords not stored in Company model
'wyrozniki': '', # usp not stored in Company model
'certyfikaty': '', # certifications is a relationship
'wartosci': company.core_values or '',
'strona_www': company.website or '',
'miasto': company.address_city or '',
'branza': company.pkd_description or ''
}
# Build prompt for AI
prompt = f"""Przeanalizuj dane polskiej firmy i wygeneruj wzbogacone informacje.
# ============================================
# STEP 4: Build comprehensive prompt for AI
# ============================================
prompt = f"""Przeanalizuj wszystkie dostepne dane o polskiej firmie i wygeneruj wzbogacone informacje.
DANE FIRMY:
=== DANE Z BAZY DANYCH ===
Nazwa: {existing_data['nazwa']}
Kategoria: {existing_data['kategoria']}
Opis krotki: {existing_data['opis_krotki']}
Opis pelny: {existing_data['opis_pelny']}
Uslugi: {existing_data['uslugi']}
Kompetencje: {existing_data['kompetencje']}
Slowa kluczowe: {existing_data['slowa_kluczowe']}
Wyrozniki: {existing_data['wyrozniki']}
Certyfikaty: {existing_data['certyfikaty']}
Wartosci firmy: {existing_data['wartosci']}
Strona WWW: {existing_data['strona_www']}
Miasto: {existing_data['miasto']}
Branza (PKD): {existing_data['branza']}
ZADANIE:
Na podstawie powyzszych danych wygeneruj wzbogacone informacje o firmie w formacie JSON.
Jesli brakuje danych, wygeneruj rozsadne propozycje na podstawie nazwy i kategorii firmy.
=== INFORMACJE Z INTERNETU (Brave Search) ===
Newsy o firmie:
{news_text if news_text else '(brak znalezionych newsow)'}
Wyniki wyszukiwania:
{web_text if web_text else '(brak wynikow)'}
=== TRESC ZE STRONY WWW FIRMY ===
{website_content[:2000] if website_content else '(nie udalo sie pobrac tresci strony)'}
=== ZADANIE ===
Na podstawie WSZYSTKICH powyzszych danych (baza danych, wyszukiwarka, strona WWW) wygeneruj wzbogacone informacje o firmie.
Wykorzystaj informacje z internetu do uzupelnienia brakujacych danych.
Jesli znalazles nowe uslugi, certyfikaty lub informacje - dodaj je do odpowiedzi.
Odpowiedz WYLACZNIE w formacie JSON (bez dodatkowego tekstu):
{{
"business_summary": "Zwiezly opis dzialalnosci firmy (2-3 zdania)",
"services_list": ["usluga1", "usluga2", "usluga3"],
"business_summary": "Zwiezly opis dzialalnosci firmy (2-3 zdania) na podstawie wszystkich zrodel",
"services_list": ["usluga1", "usluga2", "usluga3", "usluga4", "usluga5"],
"target_market": "Opis grupy docelowej klientow",
"unique_selling_points": ["wyroznik1", "wyroznik2", "wyroznik3"],
"company_values": ["wartosc1", "wartosc2", "wartosc3"],
"certifications": ["certyfikat1", "certyfikat2"],
"industry_tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],
"recent_news": "Krotkie podsumowanie ostatnich newsow o firmie (jesli sa)",
"suggested_category": "Sugerowana kategoria glowna",
"category_confidence": 0.85
"category_confidence": 0.85,
"data_sources_used": ["database", "brave_search", "website"]
}}
WAZNE:
@ -5976,10 +6146,10 @@ WAZNE:
- Wszystkie teksty po polsku
- Listy powinny zawierac 3-5 elementow
- category_confidence to liczba od 0 do 1
- Wykorzystaj maksymalnie informacje z internetu
"""
# Call Gemini API
logger.info(f"AI enrichment triggered by {current_user.email} for company: {company.name} (ID: {company.id})")
start_time = time.time()
response_text = service.generate_text(
@ -6049,12 +6219,22 @@ WAZNE:
db.commit()
logger.info(f"AI enrichment completed for {company.name}. Processing time: {processing_time}ms")
# Count sources used
sources_used = ['database']
if brave_results['news'] or brave_results['web']:
sources_used.append('brave_search')
if website_content:
sources_used.append('website')
logger.info(f"AI enrichment completed for {company.name}. Processing time: {processing_time}ms. Sources: {sources_used}")
return jsonify({
'success': True,
'message': f'Dane firmy "{company.name}" zostaly wzbogacone przez AI',
'processing_time_ms': processing_time,
'sources_used': sources_used,
'brave_results_count': len(brave_results['news']) + len(brave_results['web']),
'website_content_length': len(website_content),
'insights': ai_data
})