nordabiz/run_ai_quality_tests.py
2026-01-01 14:01:49 +01:00

152 lines
5.3 KiB
Python

#!/usr/bin/env python3
"""
AI Quality Test Runner for Norda Biznes Hub
Usage:
    python run_ai_quality_tests.py [--verbose] [--save] [--quick] [--threshold RATE]

Options:
    --verbose, -v          Show detailed output for each test
    --save, -s             Save results to JSON file
    --quick, -q            Run only high-priority tests
    --threshold, -t RATE   Pass rate threshold (default: 0.7)
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))
from tests.ai_quality_evaluator import AIQualityEvaluator, EvaluationReport
def save_report(report: EvaluationReport, output_dir: Path = None) -> Path:
    """Save an evaluation report to a timestamped JSON file.

    The file is named ``ai_quality_report_<YYYYmmdd_HHMMSS>.json`` after
    ``report.timestamp`` and contains the aggregate figures plus one entry
    per individual test result.

    Args:
        report: The evaluation report to serialize. Its ``timestamp``,
            aggregate counters, ``summary_by_category`` and ``results`` are
            read.
        output_dir: Directory to write into; created (with parents) if
            missing. May be a ``Path`` or any value ``Path()`` accepts, such
            as a plain string. Defaults to ``tests/results`` next to this
            script.

    Returns:
        Path of the JSON file that was written.
    """
    if output_dir is None:
        output_dir = Path(__file__).parent / "tests" / "results"
    else:
        # Callers may hand in a plain string; normalise so .mkdir() and the
        # "/" operator below work either way.
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = report.timestamp.strftime("%Y%m%d_%H%M%S")
    filepath = output_dir / f"ai_quality_report_{timestamp}.json"

    # Convert report to a JSON-serializable dict.
    report_dict = {
        "timestamp": report.timestamp.isoformat(),
        "total_tests": report.total_tests,
        "passed_tests": report.passed_tests,
        "failed_tests": report.failed_tests,
        "pass_rate": report.pass_rate,
        "average_score": report.average_score,
        "summary_by_category": report.summary_by_category,
        "results": [
            {
                "test_id": r.test_id,
                "query": r.query,
                "expected_companies": r.expected_companies,
                "found_companies": r.found_companies,
                "matched_companies": r.matched_companies,
                "score": r.score,
                "passed": r.passed,
                "execution_time_ms": r.execution_time_ms,
                "error": r.error,
            }
            for r in report.results
        ],
    }

    # ensure_ascii=False writes non-ASCII characters as-is instead of as
    # \uXXXX escapes, hence the explicit UTF-8 encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(report_dict, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved to: {filepath}")
    return filepath
def run_quick_tests(evaluator: AIQualityEvaluator, verbose: bool = False) -> EvaluationReport:
    """Run only the test cases whose ``priority`` is ``"high"``.

    The evaluator's ``test_cases["test_cases"]`` list is swapped for the
    high-priority subset while ``evaluate_all`` runs and is put back
    afterwards, even when the evaluation raises.

    Args:
        evaluator: Evaluator whose ``test_cases`` mapping is filtered and
            whose ``evaluate_all`` is invoked.
        verbose: Forwarded to ``evaluator.evaluate_all``.

    Returns:
        The report produced by ``evaluator.evaluate_all``.
    """
    # Remember whether the key existed so the mapping can be restored to
    # exactly its previous state (a missing key stays missing).
    had_key = "test_cases" in evaluator.test_cases
    original_cases = evaluator.test_cases.get("test_cases", [])
    high_priority = [tc for tc in original_cases if tc.get("priority") == "high"]

    # Temporarily replace test cases
    evaluator.test_cases["test_cases"] = high_priority
    try:
        print(f"Running {len(high_priority)} high-priority tests...")
        return evaluator.evaluate_all(verbose=verbose)
    finally:
        # Restore original, also on failure, so the evaluator is not left
        # holding the filtered subset.
        if had_key:
            evaluator.test_cases["test_cases"] = original_cases
        else:
            evaluator.test_cases.pop("test_cases", None)
def main(argv=None):
    """Command-line entry point: run the AI quality evaluation and exit.

    Parses the options, runs either the full suite or only the high-priority
    subset (``--quick``), optionally saves a JSON report (``--save``), prints
    a verdict and terminates the process through ``sys.exit``. The evaluator
    is closed on the way out whether or not the evaluation succeeds.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: With code 0 when the pass rate reaches the threshold,
            1 when it does not, and 2 (raised by argparse) for invalid
            arguments, including a threshold outside the 0-1 range.
    """
    parser = argparse.ArgumentParser(
        description="AI Quality Test Runner for Norda Biznes Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python run_ai_quality_tests.py -v # Verbose output
python run_ai_quality_tests.py -v -s # Verbose + save report
python run_ai_quality_tests.py -q # Quick (high-priority only)
"""
    )
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show detailed output for each test")
    parser.add_argument("--save", "-s", action="store_true",
                        help="Save results to JSON file")
    parser.add_argument("--quick", "-q", action="store_true",
                        help="Run only high-priority tests")
    parser.add_argument("--threshold", "-t", type=float, default=0.7,
                        help="Pass rate threshold (default: 0.7)")
    args = parser.parse_args(argv)

    # The threshold is compared against a pass *rate*, so anything outside
    # 0-1 (e.g. "-t 70") would make the verdict meaningless. NaN fails the
    # chained comparison and is rejected as well.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(
            f"--threshold must be between 0 and 1 (got {args.threshold})"
        )

    # Build the banner row by row so every row, including the one with the
    # variable-width threshold value, is padded out to the closing border.
    border = "═" * 60
    banner_texts = [
        "Norda Biznes Hub - AI Quality Tests",
        "Evaluating chat AI response quality",
        f"Pass threshold: {args.threshold:.0%}",
    ]
    title_row, *detail_rows = [f"║  {text:<58}║" for text in banner_texts]
    print("\n".join([
        "",
        f"╔{border}╗",
        title_row,
        f"╠{border}╣",
        *detail_rows,
        f"╚{border}╝",
        "",
    ]))

    evaluator = AIQualityEvaluator()
    try:
        if args.quick:
            report = run_quick_tests(evaluator, verbose=args.verbose)
        else:
            report = evaluator.evaluate_all(verbose=args.verbose)

        if args.save:
            save_report(report)

        # Final verdict
        passed = report.pass_rate >= args.threshold
        print(f"\n{'='*60}")
        if passed:
            print("✓ EVALUATION PASSED")
            print(f" Pass rate: {report.pass_rate:.1%} >= {args.threshold:.0%}")
        else:
            print("✗ EVALUATION FAILED")
            print(f" Pass rate: {report.pass_rate:.1%} < {args.threshold:.0%}")
        print(f" Tests: {report.passed_tests}/{report.total_tests} passed")
        print(f" Average score: {report.average_score:.2f}")
        print(f"{'='*60}")

        # SystemExit propagates through the finally clause, so the evaluator
        # is still closed before the process terminates.
        sys.exit(0 if passed else 1)
    finally:
        evaluator.close()
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()