feat: Add web search to AI enrichment
- Search Brave API for company news and web results
- Fetch company website content with BeautifulSoup
- Combine all sources (database, Brave, website) in AI prompt
- Return source usage info in API response
This commit is contained in:
parent
fdbe2d461b
commit
cf31842287
222
app.py
222
app.py
@ -5861,12 +5861,143 @@ def api_refresh_company_krs(company_id):
|
||||
db.close()
|
||||
|
||||
|
||||
def _search_brave_for_company(company_name: str, city: str = None) -> dict:
    """
    Search the Brave Search API for news and web results about a company.

    The query is the quoted company name, optionally followed by the city.
    Two endpoints are queried (news, limited to the past year, and web), each
    with a 10 second timeout and Polish country/language settings. Each
    endpoint is handled independently, so a failure of one does not prevent
    the other from being queried.

    Args:
        company_name: Company name; wrapped in double quotes in the query.
        city: Optional city appended to the query to narrow the results.

    Returns:
        Dict with two keys:
          - 'news': list of dicts with 'title', 'description', 'url', 'source'
          - 'web': list of dicts with 'title', 'description', 'url'
        Each list holds at most 5 items and all values are strings (missing
        or null fields become ''). Both lists are empty when BRAVE_API_KEY is
        not configured or when every request fails; errors are logged, never
        raised.
    """
    results = {'news': [], 'web': []}

    brave_api_key = os.getenv('BRAVE_API_KEY')
    if not brave_api_key:
        logger.warning("BRAVE_API_KEY not configured, skipping web search")
        return results

    # Imported lazily so the dependency is only needed when a search runs.
    import requests

    # Build search query
    query = f'"{company_name}"'
    if city:
        query += f' {city}'

    headers = {
        'Accept': 'application/json',
        'X-Subscription-Token': brave_api_key
    }
    base_params = {
        'q': query,
        'count': 5,
        'country': 'pl',
        'search_lang': 'pl'
    }

    # (result key, log label, endpoint, request params)
    searches = (
        (
            'news',
            'Brave News',
            'https://api.search.brave.com/res/v1/news/search',
            {**base_params, 'freshness': 'py'},  # past year
        ),
        (
            'web',
            'Brave Web',
            'https://api.search.brave.com/res/v1/web/search',
            base_params,
        ),
    )

    for kind, label, endpoint, params in searches:
        # One try per endpoint: a news failure must not skip the web search.
        try:
            response = requests.get(
                endpoint,
                headers=headers,
                params=params,
                timeout=10
            )

            if response.status_code != 200:
                logger.warning(f"{label}: HTTP {response.status_code} for '{company_name}'")
                continue

            data = response.json() or {}

            # News results sit at the top level; web results are nested
            # under 'web'. `or` guards against explicit JSON nulls.
            if kind == 'news':
                items = data.get('results') or []
            else:
                items = (data.get('web') or {}).get('results') or []

            for item in items[:5]:
                entry = {
                    'title': item.get('title') or '',
                    # The caller slices description, so it must be a string.
                    'description': item.get('description') or '',
                    'url': item.get('url') or ''
                }
                if kind == 'news':
                    entry['source'] = (item.get('meta_url') or {}).get('hostname') or ''
                results[kind].append(entry)

            logger.info(f"{label}: found {len(results[kind])} items for '{company_name}'")

        except Exception as e:
            # Best-effort enrichment: log and carry on with what we have.
            logger.error(f"Brave search error for '{company_name}': {e}")

    return results
|
||||
|
||||
|
||||
def _fetch_website_content(url: str) -> str:
|
||||
"""
|
||||
Fetch and extract text content from company website.
|
||||
Returns first 2000 chars of text content.
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
if not url:
|
||||
return ''
|
||||
|
||||
try:
|
||||
# Ensure URL has protocol
|
||||
if not url.startswith('http'):
|
||||
url = 'https://' + url
|
||||
|
||||
response = requests.get(url, timeout=10, headers={
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; NordaBizBot/1.0)'
|
||||
})
|
||||
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
# Remove scripts and styles
|
||||
for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
|
||||
tag.decompose()
|
||||
|
||||
# Get text content
|
||||
text = soup.get_text(separator=' ', strip=True)
|
||||
|
||||
# Clean up whitespace
|
||||
text = ' '.join(text.split())
|
||||
|
||||
logger.info(f"Fetched {len(text)} chars from {url}")
|
||||
return text[:3000] # Limit to 3000 chars
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch website content from {url}: {e}")
|
||||
|
||||
return ''
|
||||
|
||||
|
||||
@app.route('/api/company/<int:company_id>/enrich-ai', methods=['POST'])
|
||||
@login_required
|
||||
@limiter.limit("5 per hour")
|
||||
def api_enrich_company_ai(company_id):
|
||||
"""
|
||||
API: Enrich company data using AI (Gemini).
|
||||
API: Enrich company data using AI (Gemini) with web search.
|
||||
|
||||
Process:
|
||||
1. Search Brave API for company news and web results
|
||||
2. Fetch content from company website
|
||||
3. Combine with existing database data
|
||||
4. Send to Gemini for AI-powered enrichment
|
||||
|
||||
Generates AI insights including:
|
||||
- Business summary
|
||||
@ -5907,15 +6038,45 @@ def api_enrich_company_ai(company_id):
|
||||
'error': 'Usluga AI jest niedostepna. Skontaktuj sie z administratorem.'
|
||||
}), 503
|
||||
|
||||
# Collect existing company data for context
|
||||
# Get services from relationship or services_offered field
|
||||
logger.info(f"AI enrichment triggered by {current_user.email} for company: {company.name} (ID: {company.id})")
|
||||
|
||||
# ============================================
|
||||
# STEP 1: Search the web for company info
|
||||
# ============================================
|
||||
brave_results = _search_brave_for_company(company.name, company.address_city)
|
||||
|
||||
# Format news for prompt
|
||||
news_text = ""
|
||||
if brave_results['news']:
|
||||
news_text = "\n".join([
|
||||
f"- {item['title']}: {item['description'][:200]}"
|
||||
for item in brave_results['news'][:3]
|
||||
])
|
||||
|
||||
# Format web results for prompt
|
||||
web_text = ""
|
||||
if brave_results['web']:
|
||||
web_text = "\n".join([
|
||||
f"- {item['title']}: {item['description'][:200]}"
|
||||
for item in brave_results['web'][:3]
|
||||
])
|
||||
|
||||
# ============================================
|
||||
# STEP 2: Fetch company website content
|
||||
# ============================================
|
||||
website_content = ""
|
||||
if company.website:
|
||||
website_content = _fetch_website_content(company.website)
|
||||
|
||||
# ============================================
|
||||
# STEP 3: Collect existing company data
|
||||
# ============================================
|
||||
services_list = []
|
||||
if company.services:
|
||||
services_list = [cs.service.name for cs in company.services if cs.service]
|
||||
elif company.services_offered:
|
||||
services_list = [company.services_offered]
|
||||
|
||||
# Get competencies from relationship
|
||||
competencies_list = []
|
||||
if company.competencies:
|
||||
competencies_list = [cc.competency.name for cc in company.competencies if cc.competency]
|
||||
@ -5927,48 +6088,57 @@ def api_enrich_company_ai(company_id):
|
||||
'kategoria': company.category.name if company.category else '',
|
||||
'uslugi': ', '.join(services_list) if services_list else '',
|
||||
'kompetencje': ', '.join(competencies_list) if competencies_list else '',
|
||||
'slowa_kluczowe': '', # keywords not stored in Company model
|
||||
'wyrozniki': '', # usp not stored in Company model
|
||||
'certyfikaty': '', # certifications is a relationship
|
||||
'wartosci': company.core_values or '',
|
||||
'strona_www': company.website or '',
|
||||
'miasto': company.address_city or '',
|
||||
'branza': company.pkd_description or ''
|
||||
}
|
||||
|
||||
# Build prompt for AI
|
||||
prompt = f"""Przeanalizuj dane polskiej firmy i wygeneruj wzbogacone informacje.
|
||||
# ============================================
|
||||
# STEP 4: Build comprehensive prompt for AI
|
||||
# ============================================
|
||||
prompt = f"""Przeanalizuj wszystkie dostepne dane o polskiej firmie i wygeneruj wzbogacone informacje.
|
||||
|
||||
DANE FIRMY:
|
||||
=== DANE Z BAZY DANYCH ===
|
||||
Nazwa: {existing_data['nazwa']}
|
||||
Kategoria: {existing_data['kategoria']}
|
||||
Opis krotki: {existing_data['opis_krotki']}
|
||||
Opis pelny: {existing_data['opis_pelny']}
|
||||
Uslugi: {existing_data['uslugi']}
|
||||
Kompetencje: {existing_data['kompetencje']}
|
||||
Slowa kluczowe: {existing_data['slowa_kluczowe']}
|
||||
Wyrozniki: {existing_data['wyrozniki']}
|
||||
Certyfikaty: {existing_data['certyfikaty']}
|
||||
Wartosci firmy: {existing_data['wartosci']}
|
||||
Strona WWW: {existing_data['strona_www']}
|
||||
Miasto: {existing_data['miasto']}
|
||||
Branza (PKD): {existing_data['branza']}
|
||||
|
||||
ZADANIE:
|
||||
Na podstawie powyzszych danych wygeneruj wzbogacone informacje o firmie w formacie JSON.
|
||||
Jesli brakuje danych, wygeneruj rozsadne propozycje na podstawie nazwy i kategorii firmy.
|
||||
=== INFORMACJE Z INTERNETU (Brave Search) ===
|
||||
Newsy o firmie:
|
||||
{news_text if news_text else '(brak znalezionych newsow)'}
|
||||
|
||||
Wyniki wyszukiwania:
|
||||
{web_text if web_text else '(brak wynikow)'}
|
||||
|
||||
=== TRESC ZE STRONY WWW FIRMY ===
|
||||
{website_content[:2000] if website_content else '(nie udalo sie pobrac tresci strony)'}
|
||||
|
||||
=== ZADANIE ===
|
||||
Na podstawie WSZYSTKICH powyzszych danych (baza danych, wyszukiwarka, strona WWW) wygeneruj wzbogacone informacje o firmie.
|
||||
Wykorzystaj informacje z internetu do uzupelnienia brakujacych danych.
|
||||
Jesli znalazles nowe uslugi, certyfikaty lub informacje - dodaj je do odpowiedzi.
|
||||
|
||||
Odpowiedz WYLACZNIE w formacie JSON (bez dodatkowego tekstu):
|
||||
{{
|
||||
"business_summary": "Zwiezly opis dzialalnosci firmy (2-3 zdania)",
|
||||
"services_list": ["usluga1", "usluga2", "usluga3"],
|
||||
"business_summary": "Zwiezly opis dzialalnosci firmy (2-3 zdania) na podstawie wszystkich zrodel",
|
||||
"services_list": ["usluga1", "usluga2", "usluga3", "usluga4", "usluga5"],
|
||||
"target_market": "Opis grupy docelowej klientow",
|
||||
"unique_selling_points": ["wyroznik1", "wyroznik2", "wyroznik3"],
|
||||
"company_values": ["wartosc1", "wartosc2", "wartosc3"],
|
||||
"certifications": ["certyfikat1", "certyfikat2"],
|
||||
"industry_tags": ["tag1", "tag2", "tag3", "tag4", "tag5"],
|
||||
"recent_news": "Krotkie podsumowanie ostatnich newsow o firmie (jesli sa)",
|
||||
"suggested_category": "Sugerowana kategoria glowna",
|
||||
"category_confidence": 0.85
|
||||
"category_confidence": 0.85,
|
||||
"data_sources_used": ["database", "brave_search", "website"]
|
||||
}}
|
||||
|
||||
WAZNE:
|
||||
@ -5976,10 +6146,10 @@ WAZNE:
|
||||
- Wszystkie teksty po polsku
|
||||
- Listy powinny zawierac 3-5 elementow
|
||||
- category_confidence to liczba od 0 do 1
|
||||
- Wykorzystaj maksymalnie informacje z internetu
|
||||
"""
|
||||
|
||||
# Call Gemini API
|
||||
logger.info(f"AI enrichment triggered by {current_user.email} for company: {company.name} (ID: {company.id})")
|
||||
|
||||
start_time = time.time()
|
||||
response_text = service.generate_text(
|
||||
@ -6049,12 +6219,22 @@ WAZNE:
|
||||
|
||||
db.commit()
|
||||
|
||||
logger.info(f"AI enrichment completed for {company.name}. Processing time: {processing_time}ms")
|
||||
# Count sources used
|
||||
sources_used = ['database']
|
||||
if brave_results['news'] or brave_results['web']:
|
||||
sources_used.append('brave_search')
|
||||
if website_content:
|
||||
sources_used.append('website')
|
||||
|
||||
logger.info(f"AI enrichment completed for {company.name}. Processing time: {processing_time}ms. Sources: {sources_used}")
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'message': f'Dane firmy "{company.name}" zostaly wzbogacone przez AI',
|
||||
'processing_time_ms': processing_time,
|
||||
'sources_used': sources_used,
|
||||
'brave_results_count': len(brave_results['news']) + len(brave_results['web']),
|
||||
'website_content_length': len(website_content),
|
||||
'insights': ai_data
|
||||
})
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user