"""
Data Quality Validation Script for EventheOdds.ai

Validates historical betting data so that biased or corrupted data is not used.
Run this script before deployment or as part of CI/CD pipeline.

Exit Codes:
  0 - All validations passed
  1 - Validation failed (data quality issues detected)
"""

import json
import os
import sys
from typing import Dict, List, Tuple

# Thresholds for suspicious data
# Note: 62% allows for ~2 std deviations on samples of 300+ games
MAX_COVER_RATE = 0.62  # home-favorite cover rate above this is reported as an error
MIN_COVER_RATE = 0.30  # home-favorite cover rate below this is reported as a warning
MAX_DUPLICATE_RATE = 0.05  # share of repeated game ids above this is reported as an error
MIN_GAMES = 50  # fewer games than this is reported as a warning

# Directory holding the per-sport "<sport>_historical.json" files: <parent of this
# file's directory>/data/betting. abspath() keeps the result independent of the
# current working directory even when __file__ is a relative path.
DATA_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'data',
    'betting',
)

def _nested_number(record: dict, section: str, key: str) -> float:
    """Return record[section][key] as a number, or 0 when missing or not numeric."""
    container = record.get(section)
    if not isinstance(container, dict):
        # Covers both an absent section and an explicit JSON null.
        return 0
    value = container.get(key)
    if isinstance(value, (int, float)):
        return value
    return 0


def validate_sport(sport: str, filepath: str) -> Tuple[bool, List[str]]:
    """Validate data quality for a single sport.

    Loads the JSON file at ``filepath`` (expected to hold a list of game
    objects) and checks the duplicate-id rate and the home-favorite cover
    rate against the module thresholds. Warnings and the success line are
    printed to stdout; errors are returned to the caller.

    Args:
        sport: Sport code; upper-cased and used as the message prefix.
        filepath: Path to the sport's historical JSON file.

    Returns:
        ``(True, [])`` when no error was found, otherwise ``(False, errors)``
        where ``errors`` is a list of human-readable messages.
    """
    label = sport.upper()
    errors: List[str] = []
    warnings: List[str] = []

    # EAFP: open directly instead of checking existence first, and report every
    # read/parse failure as a validation error rather than crashing.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return False, [f"{label}: File not found: {filepath}"]
    except OSError as e:
        return False, [f"{label}: Cannot read file: {e}"]
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return False, [f"{label}: Invalid JSON: {e}"]

    if not isinstance(data, list):
        return False, [f"{label}: Data is not a list"]

    total_games = len(data)
    if total_games < MIN_GAMES:
        warnings.append(f"{label}: Only {total_games} games (minimum: {MIN_GAMES})")

    # Entries that are not JSON objects cannot be inspected; report and skip them.
    games = [g for g in data if isinstance(g, dict)]
    malformed = total_games - len(games)
    if malformed:
        errors.append(f"{label}: {malformed} entries are not JSON objects")

    # Check for duplicates (games without a truthy id are ignored here).
    ids = [g['id'] for g in games if g.get('id')]
    duplicate_count = len(ids) - len(set(ids))
    duplicate_rate = duplicate_count / max(len(ids), 1)

    if duplicate_rate > MAX_DUPLICATE_RATE:
        errors.append(
            f"{label}: High duplicate rate: {duplicate_rate*100:.1f}% ({duplicate_count} duplicates)"
        )

    # Calculate cover rate for home favorites.
    # NOTE(review): missing scores default to 0 and a push (margin == spread)
    # counts as "not covered", matching the previous behavior - confirm intended.
    home_fav_covered = 0
    home_fav_not_covered = 0

    for g in games:
        spread = _nested_number(g, 'odds', 'spreadHome')
        if spread >= 0:
            continue  # Home is not the favorite (or no spread available)
        margin = _nested_number(g, 'scores', 'homeScore') - _nested_number(g, 'scores', 'awayScore')
        if margin > abs(spread):
            home_fav_covered += 1
        else:
            home_fav_not_covered += 1

    total_favorites = home_fav_covered + home_fav_not_covered

    # None means "no home favorites found", so the rate could not be computed.
    cover_rate = None
    if total_favorites > 0:
        cover_rate = home_fav_covered / total_favorites

        if cover_rate > MAX_COVER_RATE:
            errors.append(
                f"{label}: BIASED DATA - Cover rate {cover_rate*100:.1f}% exceeds max {MAX_COVER_RATE*100}% "
                f"({home_fav_covered} covered, {home_fav_not_covered} not covered)"
            )
        elif cover_rate < MIN_COVER_RATE:
            warnings.append(
                f"{label}: Unusually low cover rate: {cover_rate*100:.1f}% "
                f"({home_fav_covered} covered, {home_fav_not_covered} not covered)"
            )
    else:
        warnings.append(f"{label}: No home-favorite games with a spread; cover rate not checked")

    # Print warnings but don't fail on them
    for w in warnings:
        print(f"  ⚠️  {w}")

    if errors:
        return False, errors

    rate_text = f"{cover_rate*100:.1f}%" if cover_rate is not None else "n/a"
    print(f"  ✅ {label}: {total_games} games, cover rate: {rate_text} (valid)")
    return True, []


def main():
    """Validate every supported sport's data file and print a summary report.

    Calls validate_sport() for each sport code on its
    ``<sport>_historical.json`` file under DATA_DIR, collecting the errors
    of every sport that did not pass.

    Returns:
        0 when every sport passed, 1 when at least one sport failed.
    """
    rule = "=" * 60
    print(rule)
    print("EventheOdds.ai Data Quality Validation")
    print(rule)
    print()

    any_failed = False
    collected = []

    for league in ('nba', 'nfl', 'nhl', 'mlb', 'epl'):
        data_file = os.path.join(DATA_DIR, f'{league}_historical.json')
        ok, problems = validate_sport(league, data_file)
        if ok:
            continue
        any_failed = True
        collected.extend(problems)

    print()
    print(rule)

    # Failure path first; the success path falls through below.
    if any_failed:
        print("❌ VALIDATION FAILED - Data quality issues detected:")
        print()
        for problem in collected:
            print(f"  🔴 {problem}")
        print()
        print(rule)
        print("ACTION REQUIRED: Fix data quality issues before deploying.")
        print("Run: python3 scripts/fetch_balldontlie_data.py --update-betting")
        print(rule)
        return 1

    print("✅ VALIDATION PASSED - All data quality checks passed")
    print(rule)
    return 0


# Script entry point: run the validation and use its return value as the
# process exit code (0 = passed, 1 = failed).
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
