#!/usr/bin/env python3
"""Build/merge betting/mlb_historical.json from Kaggle oddsDataMLB.csv.

Source:
  data/kaggle/oddsDataMLB.csv

Important:
- Kaggle file is team-perspective, not explicit home/away.
- To safely ingest without guessing venue, we create ONE canonical row per game:
  key = (date, min(team_code, opponent_code), max(team_code, opponent_code))
  and keep the Kaggle row where team_code == min(team_code, opponent_code).

This yields consistent, non-duplicated historical games suitable for most
odds-based strategies (favorite/underdog, spreads, totals), but "home" is
an arbitrary orientation, not true venue.

We merge into existing betting/mlb_historical.json by (date, homeTeam, awayTeam)
(normalized), filling missing odds fields; and we ADD new Kaggle historical games.
"""

import csv
import json
from pathlib import Path
from typing import Dict, Tuple, Optional

BASE = Path('/var/www/html/eventheodds/data')
SRC = BASE / 'kaggle' / 'oddsDataMLB.csv'
OUT = BASE / 'betting' / 'mlb_historical.json'

CODE_TO_NAME = {
    'ARI': 'Arizona Diamondbacks',
    'ATL': 'Atlanta Braves',
    'BAL': 'Baltimore Orioles',
    'BOS': 'Boston Red Sox',
    'CHC': 'Chicago Cubs',
    'CWS': 'Chicago White Sox',
    'CIN': 'Cincinnati Reds',
    'CLE': 'Cleveland Guardians',
    'COL': 'Colorado Rockies',
    'DET': 'Detroit Tigers',
    'HOU': 'Houston Astros',
    'KC': 'Kansas City Royals',
    'LAA': 'Los Angeles Angels',
    'LAD': 'Los Angeles Dodgers',
    'MIA': 'Miami Marlins',
    'MIL': 'Milwaukee Brewers',
    'MIN': 'Minnesota Twins',
    'NYM': 'New York Mets',
    'NYY': 'New York Yankees',
    'OAK': 'Oakland Athletics',
    'PHI': 'Philadelphia Phillies',
    'PIT': 'Pittsburgh Pirates',
    'SD':  'San Diego Padres',
    'SEA': 'Seattle Mariners',
    'SF':  'San Francisco Giants',
    'STL': 'St. Louis Cardinals',
    'TB':  'Tampa Bay Rays',
    'TEX': 'Texas Rangers',
    'TOR': 'Toronto Blue Jays',
    'WSH': 'Washington Nationals',
}


def norm(s: str) -> str:
    return ' '.join((s or '').strip().lower().split())


def to_int(x) -> Optional[int]:
    if x is None:
        return None
    s = str(x).strip()
    if s == '' or s.lower() in ('na','nan'):
        return None
    try:
        return int(float(s))
    except Exception:
        return None


def to_float(x) -> Optional[float]:
    if x is None:
        return None
    s = str(x).strip()
    if s == '' or s.lower() in ('na','nan'):
        return None
    try:
        return float(s)
    except Exception:
        return None


def build_game_from_row(row: dict) -> Optional[dict]:
    date = (row.get('date') or '').strip()
    season = to_int(row.get('season')) or (int(date[:4]) if date else None)
    team = (row.get('team') or '').strip()
    opp = (row.get('opponent') or '').strip()
    if not date or not team or not opp or not season:
        return None

    home_code = team
    away_code = opp

    home = CODE_TO_NAME.get(home_code, home_code)
    away = CODE_TO_NAME.get(away_code, away_code)

    sh = to_int(row.get('runs'))
    sa = to_int(row.get('oppRuns'))
    if sh is None or sa is None:
        return None

    ml_home = to_int(row.get('moneyLine'))
    ml_away = to_int(row.get('oppMoneyLine'))
    spread_home = to_float(row.get('runLine'))
    spread_away = to_float(row.get('oppRunLine'))
    total = to_float(row.get('total'))

    has_real = any(v is not None for v in (ml_home, ml_away, spread_home, spread_away, total))

    # results
    if sh > sa:
        winner = 'home'
    elif sa > sh:
        winner = 'away'
    else:
        winner = 'draw'

    spread_covered = None
    if spread_home is not None:
        margin = sh - sa
        adj = margin + spread_home
        if adj > 0:
            spread_covered = 'home'
        elif adj < 0:
            spread_covered = 'away'
        else:
            spread_covered = 'push'

    total_result = None
    if total is not None:
        tp = sh + sa
        if tp > total:
            total_result = 'over'
        elif tp < total:
            total_result = 'under'
        else:
            total_result = 'push'

    gid = f"kaggle-mlb-neutral-{date}-{norm(away).replace(' ','_')}-at-{norm(home).replace(' ','_')}"

    return {
        'id': gid,
        'bdl_game_id': None,
        'sport': 'mlb',
        'date': date,
        'season': season,
        'homeTeam': home,
        'awayTeam': away,
        'scores': {'homeScore': sh, 'awayScore': sa},
        'odds': {
            'moneylineHome': ml_home,
            'moneylineAway': ml_away,
            'spreadHome': spread_home,
            'spreadAway': spread_away,
            'totalLine': total,
            'source': 'kaggle_neutral',
        },
        'hasRealOdds': bool(has_real),
        'result': {
            'winner': winner,
            'spreadCovered': spread_covered,
            'totalResult': total_result,
            'margin': sh - sa,
            'totalPoints': sh + sa,
        },
        'meta': {
            'note': 'Kaggle MLB oddsDataMLB ingested with neutral orientation; home/away is not true venue.'
        }
    }


def main():
    if not SRC.exists():
        raise SystemExit(f"Missing {SRC}")

    existing = []
    if OUT.exists():
        try:
            existing = json.load(open(OUT, 'r'))
        except Exception:
            existing = []

    # Index existing by date+teams
    idx: Dict[Tuple[str, str, str], dict] = {}
    for g in existing:
        key = (g.get('date') or '', norm(g.get('homeTeam','')), norm(g.get('awayTeam','')))
        if key[0] and key[1] and key[2]:
            idx[key] = g

    added = 0
    updated = 0

    # Dedup Kaggle rows to one per actual game (canonical by min(team,opp))
    kept = 0
    seen: Dict[Tuple[str, str, str], dict] = {}

    with open(SRC, newline='', encoding='utf-8') as f:
        r = csv.DictReader(f)
        for row in r:
            date = (row.get('date') or '').strip()
            team = (row.get('team') or '').strip()
            opp = (row.get('opponent') or '').strip()
            if not date or not team or not opp:
                continue
            a, b = sorted([team, opp])
            k = (date, a, b)
            # Keep only the row where team == min(team,opp)
            if team != a:
                continue
            seen[k] = row

    for (date, a, b), row in seen.items():
        rec = build_game_from_row(row)
        if not rec:
            continue
        key = (rec.get('date') or '', norm(rec.get('homeTeam','')), norm(rec.get('awayTeam','')))
        if key in idx:
            cur = idx[key]
            cur_odds = cur.get('odds') or {}
            new_odds = rec.get('odds') or {}
            # fill missing
            for k2 in ['moneylineHome','moneylineAway','spreadHome','spreadAway','totalLine']:
                if cur_odds.get(k2) is None and new_odds.get(k2) is not None:
                    cur_odds[k2] = new_odds.get(k2)
            if cur_odds.get('source') in (None, 'estimated_DO_NOT_BACKTEST'):
                cur_odds['source'] = new_odds.get('source','kaggle_neutral')
            cur['odds'] = cur_odds
            cur['hasRealOdds'] = bool(cur.get('hasRealOdds') or any(cur_odds.get(k2) is not None for k2 in ('moneylineHome','moneylineAway','spreadHome','totalLine')))
            updated += 1
        else:
            idx[key] = rec
            added += 1
        kept += 1

    out = list(idx.values())
    out.sort(key=lambda x: x.get('date',''))

    OUT.parent.mkdir(parents=True, exist_ok=True)
    json.dump(out, open(OUT, 'w'), indent=2)

    real = sum(1 for g in out if g.get('hasRealOdds'))
    print(f"Merged MLB Kaggle into {OUT}")
    print(f"Existing: {len(existing)} | Kaggle canonical games: {kept} | Added: {added} | Updated: {updated}")
    print(f"Total: {len(out)} | hasRealOdds: {real} ({(real/len(out)*100) if out else 0:.1f}%)")


if __name__ == '__main__':
    main()
