nordabiz/tests/ai_quality_evaluator.py
Maciej Pienczyn 6c6f8054d2
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
chore: remove deprecated google-generativeai SDK and gemini-2.0-flash model
- Remove google-generativeai==0.8.6 from requirements.txt (no longer imported)
- Remove gemini-2.0-flash alias and pricing (deprecated March 31)
- Update test evaluator to use 3-flash instead of flash-2.0

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 20:10:12 +01:00

367 lines
12 KiB
Python

"""
AI Quality Evaluator for Norda Biznes Partner
Evaluates the quality of AI chat responses by:
1. Loading test cases from JSON
2. Sending queries to the chat engine
3. Extracting company names from responses
4. Comparing with expected companies
5. Calculating pass rate and metrics
"""
import argparse
import json
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Set, Tuple

# Add parent directory to path so sibling project modules can be imported
sys.path.insert(0, str(Path(__file__).parent.parent))

from database import SessionLocal, init_db, Company
from nordabiz_chat import NordaBizChatEngine
import gemini_service
@dataclass
class TestResult:
    """Result of evaluating a single test case.

    Captures the query sent, the companies expected versus actually found
    in the AI response, and the pass/fail scoring metadata.
    """
    test_id: str                    # identifier from the test-case JSON ("unknown" if absent)
    query: str                      # user query sent to the chat engine
    expected_companies: List[str]   # company names the response should mention
    found_companies: List[str]      # company names detected in the response text
    matched_companies: List[str]    # expected companies that were actually found
    response_text: str              # AI response, truncated to 500 chars for storage
    score: float                    # fraction of expected companies matched, 0.0 to 1.0
    passed: bool                    # True when score >= the configured pass threshold
    execution_time_ms: int          # wall-clock duration of the chat round trip
    error: Optional[str] = None     # exception message when the test case crashed
@dataclass
class EvaluationReport:
    """Aggregate report for a full evaluation run over all test cases."""
    timestamp: datetime                 # when the evaluation run finished
    total_tests: int                    # number of test cases executed
    passed_tests: int                   # count of cases with passed == True
    failed_tests: int                   # total_tests - passed_tests
    pass_rate: float                    # passed_tests / total_tests (0.0 when empty)
    average_score: float                # mean of per-case scores (0.0 when empty)
    results: List[TestResult] = field(default_factory=list)          # per-case results
    summary_by_category: Dict[str, Dict] = field(default_factory=dict)  # per-category stats
class AIQualityEvaluator:
    """Evaluates AI chat quality using JSON-defined test cases.

    For each case, sends the query to the chat engine, extracts company
    names mentioned in the response, and scores them against the expected
    companies. Call close() when done to release the DB session.
    """

    def __init__(self, test_cases_path: str = None):
        """
        Initialize evaluator.

        Args:
            test_cases_path: Path to test cases JSON file. Defaults to
                "ai_quality_test_cases.json" next to this module.

        Raises:
            FileNotFoundError: if the test-cases file does not exist.
        """
        if test_cases_path is None:
            test_cases_path = Path(__file__).parent / "ai_quality_test_cases.json"
        self.test_cases_path = Path(test_cases_path)
        self.test_cases = self._load_test_cases()
        self.config = self.test_cases.get("evaluation_config", {})
        # Open a DB session shared by all test cases; released in close().
        init_db()
        self.db = SessionLocal()
        # Initialize the Gemini model used by the chat engine.
        gemini_service.init_gemini_service(model='3-flash')
        # Snapshot of active company names, used to detect mentions in responses.
        self.all_companies = self._load_company_names()

    def _load_test_cases(self) -> Dict:
        """Load and parse the test cases JSON file."""
        if not self.test_cases_path.exists():
            raise FileNotFoundError(f"Test cases file not found: {self.test_cases_path}")
        with open(self.test_cases_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _load_company_names(self) -> Set[str]:
        """Load all active company names from the database."""
        companies = self.db.query(Company).filter_by(status='active').all()
        return {c.name for c in companies}

    def _extract_companies_from_response(self, response: str) -> List[str]:
        """
        Extract known company names mentioned in the AI response.

        Uses two strategies:
        1. Direct (case-insensitive) substring matching of the full name.
        2. Partial matching on the first word of the company name (> 3
           chars), verified as a standalone word to avoid false positives.
        """
        found = []
        response_lower = response.lower()
        # Tokenize once, outside the loop, for whole-word verification.
        response_words = {word.lower() for word in response.split()}
        for company_name in self.all_companies:
            # Strategy 1: full company name appears in the response.
            if company_name.lower() in response_lower:
                found.append(company_name)
                continue
            # Strategy 2: distinctive first word of the company name.
            parts = company_name.split()
            if not parts:
                # Guard: blank/whitespace-only names would IndexError below.
                continue
            first_word = parts[0].lower()
            if len(first_word) > 3 and first_word in response_lower:
                # Require a standalone word match, not just a substring hit.
                if first_word in response_words:
                    found.append(company_name)
        return found

    def _calculate_score(
        self,
        expected: List[str],
        found: List[str]
    ) -> Tuple[float, List[str]]:
        """
        Calculate the match score between expected and found companies.

        Returns:
            Tuple of (score, list of matched expected companies). Score is
            the proportion of expected companies found; 1.0 when nothing
            was expected.
        """
        if not expected:
            return 1.0, []
        found_lower = {f.lower() for f in found}
        # Find matches (case-insensitive, with partial containment fallback).
        matched = []
        for exp in expected:
            exp_lower = exp.lower()
            # Exact (case-insensitive) match.
            if exp_lower in found_lower:
                matched.append(exp)
                continue
            # Partial match: either name contains the other.
            for f in found:
                if exp_lower in f.lower() or f.lower() in exp_lower:
                    matched.append(exp)
                    break
        score = len(matched) / len(expected)
        return score, matched

    def evaluate_single(self, test_case: Dict, verbose: bool = False) -> "TestResult":
        """
        Evaluate a single test case.

        Args:
            test_case: Test case dictionary (must contain "query" and
                "expected_companies"; "id" is optional).
            verbose: Print details during evaluation.

        Returns:
            TestResult; on any exception the result carries score 0.0,
            passed=False and the error message (never raises).
        """
        test_id = test_case.get("id", "unknown")
        query = test_case["query"]
        expected = test_case["expected_companies"]
        if verbose:
            print(f"\n  [{test_id}] Query: {query}")
        start_time = time.time()
        try:
            # Create chat engine and start a test conversation.
            engine = NordaBizChatEngine(self.db)
            # Use a test user ID (1 = system user for testing).
            test_user_id = 1
            conversation = engine.start_conversation(
                user_id=test_user_id,
                title=f"Test: {test_id}",
                conversation_type='test'
            )
            # Send the query and collect the AI response.
            response = engine.send_message(conversation.id, query)
            response_text = response.get("response", "")
            execution_time_ms = int((time.time() - start_time) * 1000)
            # Extract companies from response and score against expectations.
            found = self._extract_companies_from_response(response_text)
            score, matched = self._calculate_score(expected, found)
            # Pass/fail against the configured threshold (default 0.7).
            threshold = self.config.get("pass_threshold", 0.7)
            passed = score >= threshold
            if verbose:
                status = "PASS" if passed else "FAIL"
                print(f"  Status: {status} (score: {score:.2f})")
                print(f"  Expected: {expected}")
                print(f"  Found: {found}")
                if matched:
                    print(f"  Matched: {matched}")
            return TestResult(
                test_id=test_id,
                query=query,
                expected_companies=expected,
                found_companies=found,
                matched_companies=matched,
                response_text=response_text[:500],  # Truncate for storage
                score=score,
                passed=passed,
                execution_time_ms=execution_time_ms
            )
        except Exception as e:
            # Any failure in the chat pipeline counts as a failed test case,
            # recorded with its error message rather than aborting the run.
            execution_time_ms = int((time.time() - start_time) * 1000)
            if verbose:
                print(f"  ERROR: {str(e)}")
            return TestResult(
                test_id=test_id,
                query=query,
                expected_companies=expected,
                found_companies=[],
                matched_companies=[],
                response_text="",
                score=0.0,
                passed=False,
                execution_time_ms=execution_time_ms,
                error=str(e)
            )

    def evaluate_all(self, verbose: bool = False) -> "EvaluationReport":
        """
        Run all test cases and generate the aggregate report.

        Args:
            verbose: Print details during evaluation and a final summary.
        """
        test_cases = self.test_cases.get("test_cases", [])
        if verbose:
            print(f"\n{'='*60}")
            print(f"AI Quality Evaluation - {len(test_cases)} test cases")
            print(f"{'='*60}")
        results = []
        category_stats = {}
        for tc in test_cases:
            result = self.evaluate_single(tc, verbose)
            results.append(result)
            # Accumulate per-category stats.
            category = tc.get("category", "Other")
            stats = category_stats.setdefault(
                category, {"total": 0, "passed": 0, "scores": []}
            )
            stats["total"] += 1
            if result.passed:
                stats["passed"] += 1
            stats["scores"].append(result.score)
        # Overall metrics (guard against an empty test suite).
        passed = sum(1 for r in results if r.passed)
        total = len(results)
        pass_rate = passed / total if total > 0 else 0.0
        avg_score = sum(r.score for r in results) / total if total > 0 else 0.0
        # Finalize category stats.
        summary_by_category = {}
        for cat, stats in category_stats.items():
            summary_by_category[cat] = {
                "total": stats["total"],
                "passed": stats["passed"],
                "pass_rate": stats["passed"] / stats["total"] if stats["total"] > 0 else 0.0,
                "avg_score": sum(stats["scores"]) / len(stats["scores"]) if stats["scores"] else 0.0
            }
        report = EvaluationReport(
            timestamp=datetime.now(),
            total_tests=total,
            passed_tests=passed,
            failed_tests=total - passed,
            pass_rate=pass_rate,
            average_score=avg_score,
            results=results,
            summary_by_category=summary_by_category
        )
        if verbose:
            self._print_summary(report)
        return report

    def _print_summary(self, report: "EvaluationReport"):
        """Print a human-readable evaluation summary to stdout."""
        print(f"\n{'='*60}")
        print("EVALUATION SUMMARY")
        print(f"{'='*60}")
        print(f"Total tests: {report.total_tests}")
        print(f"Passed: {report.passed_tests}")
        print(f"Failed: {report.failed_tests}")
        print(f"Pass rate: {report.pass_rate:.1%}")
        print(f"Average score: {report.average_score:.2f}")
        print(f"\nBy Category:")
        for cat, stats in sorted(report.summary_by_category.items()):
            print(f"  {cat}: {stats['passed']}/{stats['total']} ({stats['pass_rate']:.0%})")
        # Show failed tests with what was expected vs. found.
        failed = [r for r in report.results if not r.passed]
        if failed:
            print(f"\nFailed Tests:")
            for r in failed:
                print(f"  [{r.test_id}] {r.query}")
                print(f"    Expected: {r.expected_companies}")
                print(f"    Found: {r.found_companies}")

    def close(self):
        """Close the database session opened in __init__."""
        self.db.close()
def main():
    """Run the evaluation from the command line.

    Exits with code 0 when the overall pass rate meets the threshold,
    1 otherwise. The threshold is configurable via --threshold and
    defaults to 0.7 (matching previous hard-coded behavior).
    """
    parser = argparse.ArgumentParser(description="AI Quality Evaluator for Norda Biznes")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--test-cases", "-t", type=str, help="Path to test cases JSON")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.7,
        help="Minimum pass rate required for exit code 0 (default: 0.7)"
    )
    args = parser.parse_args()
    evaluator = AIQualityEvaluator(args.test_cases)
    try:
        report = evaluator.evaluate_all(verbose=args.verbose)
        # Exit code reflects whether the suite met the required pass rate.
        threshold = args.threshold
        if report.pass_rate >= threshold:
            print(f"\nEvaluation PASSED (pass rate: {report.pass_rate:.1%} >= {threshold:.0%})")
            sys.exit(0)
        else:
            print(f"\nEvaluation FAILED (pass rate: {report.pass_rate:.1%} < {threshold:.0%})")
            sys.exit(1)
    finally:
        # Always release the DB session, even when evaluation raises.
        evaluator.close()


if __name__ == "__main__":
    main()