#!/usr/bin/env python3
"""
Data Quality Check Script
Audits SportsDB data for completeness and consistency
Run: Daily 6:00 AM UTC
"""
import os
import json
import psycopg2
from datetime import datetime, timezone, timedelta

def load_db_url():
    """Return the SPORTS_DATABASE_URL connection string, query string stripped.

    Search order:
      1. Known .env file locations (production path, then repo-relative).
      2. The process environment, as a fallback.

    Everything after '?' is dropped (psycopg2 gets the bare DSN), and
    surrounding quotes — common in .env files — are removed.

    Returns:
        str: the database URL, or '' if it is defined nowhere.
    """
    env_paths = [
        '/var/www/html/eventheodds/.env',
        os.path.join(os.path.dirname(__file__), '..', '.env'),
    ]
    for env_path in env_paths:
        try:
            with open(env_path, 'r') as f:
                for line in f:
                    if line.startswith('SPORTS_DATABASE_URL='):
                        value = line.split('=', 1)[1].strip()
                        # .env values are often wrapped in quotes; unwrap
                        # them or the DSN handed to psycopg2 is invalid.
                        value = value.strip('\'"')
                        return value.split('?')[0]
        except FileNotFoundError:
            continue
    return os.environ.get('SPORTS_DATABASE_URL', '').split('?')[0]


def _check_recent_games(cur, results):
    """Check 1: games per league over the last 7 days."""
    print("Checking recent games by league...")
    cur.execute('''
        SELECT league, COUNT(*) as count
        FROM "SportsGame"
        WHERE "gameDate" >= NOW() - INTERVAL '7 days'
        GROUP BY league
        ORDER BY league
    ''')
    recent_games = {row[0]: row[1] for row in cur.fetchall()}
    results['checks'].append({
        'name': 'recent_games_by_league',
        'data': recent_games,
        'total': sum(recent_games.values())
    })


def _check_missing_odds(cur, results):
    """Check 2: recent games with neither moneyline nor spread odds."""
    print("Checking games missing odds...")
    cur.execute('''
        SELECT league, COUNT(*) as count
        FROM "SportsGame"
        WHERE "gameDate" >= NOW() - INTERVAL '7 days'
        AND "moneylineHome" IS NULL
        AND "spreadHome" IS NULL
        GROUP BY league
        ORDER BY league
    ''')
    missing_odds = {row[0]: row[1] for row in cur.fetchall()}
    if missing_odds:
        results['warnings'].append({
            'type': 'missing_odds',
            'data': missing_odds,
            'message': f'{sum(missing_odds.values())} games missing odds in last 7 days'
        })
    results['checks'].append({
        'name': 'games_missing_odds',
        'data': missing_odds
    })


def _check_injury_freshness(cur, results):
    """Check 3: flag leagues whose newest injury row is older than 12h."""
    print("Checking injury data freshness...")
    cur.execute('''
        SELECT league,
               COUNT(*) as total,
               MAX("updatedAt") as last_update
        FROM "PlayerInjury"
        GROUP BY league
        ORDER BY league
    ''')
    # Hoist "now" out of the loop so every league is compared against the
    # same reference instant.
    now = datetime.now(timezone.utc)
    injury_freshness = {}
    stale_injuries = []
    for league, total, last_update in cur.fetchall():
        injury_freshness[league] = {
            'count': total,
            'last_update': last_update.isoformat() if last_update else None
        }
        # NOTE(review): assumes "updatedAt" comes back as a naive-UTC
        # datetime; .replace() reinterprets (does not convert) the tzinfo.
        # If the column is timestamptz this would mislabel — confirm schema.
        if last_update and (now - last_update.replace(tzinfo=timezone.utc)) > timedelta(hours=12):
            stale_injuries.append(league)

    if stale_injuries:
        results['warnings'].append({
            'type': 'stale_injuries',
            'leagues': stale_injuries,
            'message': f'Injury data older than 12h for: {", ".join(stale_injuries)}'
        })
    results['checks'].append({
        'name': 'injury_freshness',
        'data': injury_freshness
    })


def _check_recent_props(cur, results):
    """Check 4: player-prop ingest volume over the last 24h (warn < 100)."""
    print("Checking recent player props...")
    cur.execute('''
        SELECT COUNT(*) FROM "PlayerPropLine"
        WHERE "createdAt" >= NOW() - INTERVAL '24 hours'
    ''')
    recent_props = cur.fetchone()[0]
    results['checks'].append({
        'name': 'props_last_24h',
        'count': recent_props
    })
    if recent_props < 100:
        results['warnings'].append({
            'type': 'low_props',
            'count': recent_props,
            'message': f'Only {recent_props} props ingested in last 24h'
        })


def _check_feed_status(cur, results):
    """Check 5: per-source external feed activity over the last 24h."""
    print("Checking external feed records...")
    cur.execute('''
        SELECT source, COUNT(*) as count,
               MAX("createdAt") as last_ingest
        FROM "ExternalFeedRecord"
        WHERE "createdAt" >= NOW() - INTERVAL '24 hours'
        GROUP BY source
        ORDER BY source
    ''')
    feed_status = {}
    for source, count, last_ingest in cur.fetchall():
        feed_status[source] = {
            'count_24h': count,
            'last_ingest': last_ingest.isoformat() if last_ingest else None
        }
    results['checks'].append({
        'name': 'external_feeds_24h',
        'data': feed_status
    })


def _check_duplicates(cur, results):
    """Check 6: duplicate (league, externalGameId) pairs — recorded as errors."""
    print("Checking for duplicates...")
    cur.execute('''
        SELECT league, "externalGameId", COUNT(*) as dupes
        FROM "SportsGame"
        WHERE "externalGameId" IS NOT NULL
        GROUP BY league, "externalGameId"
        HAVING COUNT(*) > 1
        LIMIT 10
    ''')
    duplicates = cur.fetchall()
    if duplicates:
        results['errors'].append({
            'type': 'duplicate_games',
            'count': len(duplicates),
            'samples': [{'league': d[0], 'externalId': d[1], 'count': d[2]} for d in duplicates[:5]]
        })


def run_quality_checks(conn):
    """Run all data quality checks and return a results dict.

    Args:
        conn: an open DB-API connection (psycopg2 in production).

    Returns:
        dict with keys 'timestamp', 'checks' (one entry per check),
        'warnings' (soft issues), and 'errors' (hard data problems).
    """
    results = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'checks': [],
        'warnings': [],
        'errors': [],
    }

    cur = conn.cursor()
    try:
        # Each helper appends its findings to `results` independently.
        _check_recent_games(cur, results)
        _check_missing_odds(cur, results)
        _check_injury_freshness(cur, results)
        _check_recent_props(cur, results)
        _check_feed_status(cur, results)
        _check_duplicates(cur, results)
    finally:
        # The original leaked the cursor if any query raised; always close.
        cur.close()
    return results


def main():
    """Entry point: run the checks, persist JSON results, print a summary."""
    print("=" * 60)
    print("DATA QUALITY CHECK")
    print(f"Time: {datetime.now(timezone.utc).isoformat()}")
    print("=" * 60)

    db_url = load_db_url()
    if not db_url:
        print("ERROR: SPORTS_DATABASE_URL not found")
        return

    try:
        conn = psycopg2.connect(db_url)
        try:
            results = run_quality_checks(conn)
        finally:
            # The original skipped conn.close() if a check raised; always
            # release the connection.
            conn.close()

        # Persist results for downstream dashboards / alerting.
        output_dir = '/var/www/html/eventheodds/data'
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, 'quality_check_results.json')
        with open(output_path, 'w') as f:
            # default=str covers any datetime values left in the payload.
            json.dump(results, f, indent=2, default=str)

        # Print summary
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Checks run: {len(results['checks'])}")
        print(f"Warnings: {len(results['warnings'])}")
        print(f"Errors: {len(results['errors'])}")

        if results['warnings']:
            print("\nWARNINGS:")
            for w in results['warnings']:
                print(f"  - {w['message']}")

        if results['errors']:
            print("\nERRORS:")
            for e in results['errors']:
                print(f"  - {e['type']}: {e.get('count', 'N/A')} issues")

        print(f"\nResults saved to {output_path}")

    except Exception as e:
        # Top-level boundary for a cron job: report and exit cleanly rather
        # than dumping a traceback into cron mail.
        print(f"ERROR: {e}")


if __name__ == '__main__':
    # Run only when invoked directly (e.g. by the daily cron), not on import.
    main()
