#!/usr/bin/env python3
"""Build soccer league historical betting files from football-data.co.uk-style CSVs (Kaggle).

Inputs:
  data/kaggle/{league}_{season}.csv where league in (epl,bundesliga,laliga,ligue1,seriea)
  season token examples: 1920, 2021, 2122, 2223, 2324

Outputs:
  data/betting/{league}_historical.json (standard schema)

We use AvgH/AvgD/AvgA (decimal odds) when available, else B365H/B365D/B365A,
and convert them to American moneylines.
We also ingest the Asian handicap line (AHh, else AHCh) as spreadHome and set
spreadAway to its negation.
Total line is set to 2.5 when Avg>2.5 or Avg<2.5 over/under odds exist.
"""

import csv
import json
import math
import re
from datetime import datetime
from pathlib import Path
from typing import Optional

# Absolute root of the data tree; the input and output directories hang off it.
BASE = Path('/var/www/html/eventheodds/data')
# Input directory searched for the per-season {league}_{season}.csv files.
KAGGLE = BASE / 'kaggle'
# Output directory that receives the generated {league}_historical.json files.
BETTING = BASE / 'betting'

# League keys processed by main(); each one is used as the CSV filename prefix,
# the output filename prefix, and the 'sport'/'league' value of every record.
LEAGUES = ['epl','bundesliga','laliga','ligue1','seriea']


def parse_date(d: str) -> Optional[str]:
    """Normalise a CSV date string to ISO ``YYYY-MM-DD``.

    Args:
        d: Raw date text. ``None`` and blank strings are tolerated.

    Returns:
        The ISO-formatted date, or ``None`` when the value is empty or matches
        none of the supported layouts (``dd/mm/YYYY``, ``dd/mm/yy``,
        ``YYYY-mm-dd``).
    """
    text = (d or '').strip()
    if not text:
        return None
    # Day-first layouts are tried first because the data is often dd/mm/YYYY.
    # '%Y' needs four digits, so two-digit years fall through to '%y'.
    for fmt in ('%d/%m/%Y', '%d/%m/%y', '%Y-%m-%d'):
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            # Wrong layout or impossible calendar date: try the next format.
            continue
        return parsed.strftime('%Y-%m-%d')
    return None


def to_float(x) -> Optional[float]:
    """Parse a CSV cell into a finite float.

    Args:
        x: Raw cell value. Normally a string or ``None``; other values
            (e.g. numbers) are converted through ``str()`` first.

    Returns:
        The parsed number, or ``None`` when the cell is missing, blank, an
        ``NA``/``NaN`` marker, unparseable, or not a finite number.
    """
    if x is None:
        return None
    text = x.strip() if isinstance(x, str) else str(x).strip()
    if not text or text.lower() in ('na', 'nan'):
        return None
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts 'inf', 'infinity', '-nan' and overflows such as
    # '1e999'. Those are not usable odds or lines: they break integer rounding
    # downstream and serialise to invalid JSON, so treat them as missing.
    return value if math.isfinite(value) else None


def decimal_to_american(dec: Optional[float]) -> Optional[int]:
    """Convert decimal odds to American (moneyline) odds.

    Args:
        dec: Decimal odds, or ``None`` when no price is available.

    Returns:
        A positive integer for prices of 2.0 and above (underdog or even
        money), a negative integer for prices between 1.0 and 2.0 (favourite),
        or ``None`` when the input is missing, not finite, or not above 1.0.
    """
    # NaN fails every comparison, so it has to be rejected explicitly;
    # otherwise it reaches round() and raises.
    if dec is None or not math.isfinite(dec) or dec <= 1.0:
        return None
    if dec >= 2.0:
        american = (dec - 1.0) * 100
    else:
        # favorite
        american = -100 / (dec - 1.0)
    # Absurdly large prices can overflow to infinity in the arithmetic above.
    if not math.isfinite(american):
        return None
    return int(round(american))


def infer_season_start_from_token(token: str, *, century_pivot: int = 69) -> Optional[int]:
    """Derive the starting calendar year of a season from its filename token.

    Args:
        token: Four-digit season token made of two two-digit years,
            e.g. ``'2324'`` for the season starting in 2023.
        century_pivot: Two-digit start years greater than or equal to this
            value are placed in the 1900s, the rest in the 2000s. The default
            of 69 is the same cut-off ``strptime`` applies to ``%y``, so
            ``'9900'`` yields 1999 rather than 2099.

    Returns:
        The four-digit start year, or ``None`` when *token* is not exactly
        four digits.
    """
    m = re.fullmatch(r'(\d{2})(\d{2})', token)
    if not m:
        return None
    start_yy = int(m.group(1))
    century = 1900 if start_yy >= century_pivot else 2000
    return century + start_yy


def _first_float(row: dict, *columns: str) -> Optional[float]:
    """Return the first of *columns* in *row* that parses to a float, else None."""
    for column in columns:
        value = to_float(row.get(column))
        if value is not None:
            return value
    return None


def _parse_goals(raw: Optional[str]) -> Optional[int]:
    """Return a goal count as an int, or None when it is blank or not an integer."""
    text = (raw or '').strip()
    if not text:
        return None
    try:
        return int(text)
    except ValueError:
        return None


def _row_to_game(row: dict, league: str, token: str,
                 season_start: Optional[int]) -> Optional[dict]:
    """Convert one CSV row into a game record, or None when the row is unusable."""
    date = parse_date(row.get('Date'))
    if not date:
        return None
    home = (row.get('HomeTeam') or '').strip()
    away = (row.get('AwayTeam') or '').strip()
    if not home or not away:
        return None

    # A row without a full-time score has no result to record. Skip it instead
    # of inventing a 0-0 draw, the same way rows with malformed scores are
    # skipped.
    home_goals = _parse_goals(row.get('FTHG'))
    away_goals = _parse_goals(row.get('FTAG'))
    if home_goals is None or away_goals is None:
        return None

    # Market-average 1X2 prices, falling back per column to Bet365.
    ml_home = decimal_to_american(_first_float(row, 'AvgH', 'B365H'))
    ml_draw = decimal_to_american(_first_float(row, 'AvgD', 'B365D'))
    ml_away = decimal_to_american(_first_float(row, 'AvgA', 'B365A'))

    # Asian handicap: use AHh, or AHCh when AHh is absent or unparseable.
    handicap = _first_float(row, 'AHh', 'AHCh')
    if handicap is not None:
        # Adding 0.0 turns a parsed '-0' into +0.0 so the JSON never shows -0.0.
        handicap += 0.0

    # totals (2.5): the line is only implied by the presence of over/under odds.
    total_line = 2.5 if _first_float(row, 'Avg>2.5', 'Avg<2.5') is not None else None

    margin = home_goals - away_goals
    if margin > 0:
        winner = 'home'
    elif margin < 0:
        winner = 'away'
    else:
        winner = 'draw'

    odds = {
        'moneylineHome': ml_home,
        'moneylineDraw': ml_draw,
        'moneylineAway': ml_away,
        'spreadHome': handicap,
        # Subtracting from 0.0 (rather than negating) keeps a pick'em line at
        # +0.0 on both sides.
        'spreadAway': (0.0 - handicap) if handicap is not None else None,
        'totalLine': total_line,
        'source': 'football-data.co.uk',
    }
    has_real = any(
        odds[key] is not None
        for key in ('moneylineHome', 'moneylineAway', 'moneylineDraw', 'spreadHome', 'totalLine')
    )

    # Without a usable season token, fall back to the calendar year of the match.
    season_int = season_start if season_start is not None else int(date[:4])

    home_slug = home.lower().replace(' ', '_')
    away_slug = away.lower().replace(' ', '_')
    return {
        'id': f"{league}-{date}-{home_slug}-vs-{away_slug}",
        'bdl_game_id': None,
        'sport': league,
        'league': league,
        'date': date,
        'season': season_int,
        'homeTeam': home,
        'awayTeam': away,
        'scores': {'homeScore': home_goals, 'awayScore': away_goals},
        'odds': odds,
        'hasRealOdds': bool(has_real),
        'result': {'winner': winner, 'margin': margin, 'totalPoints': home_goals + away_goals},
        'meta': {'seasonToken': token},
    }


def build_league(league: str) -> None:
    """Build ``{league}_historical.json`` from every ``{league}_*.csv`` input.

    Reads each matching CSV under ``KAGGLE``, converts every usable row into a
    game record, sorts the records by date, writes them as JSON under
    ``BETTING`` and prints a one-line summary. Rows lacking a parseable date,
    a team name or a full-time score are skipped.

    Args:
        league: League key used as the input filename prefix, the output
            filename prefix and the ``sport``/``league`` value of each record.
    """
    out = []

    for fp in sorted(KAGGLE.glob(f"{league}_*.csv")):
        # Everything after the first underscore of the stem is the season token.
        token = fp.stem.split('_', 1)[1]
        season_start = infer_season_start_from_token(token)

        # 'utf-8-sig' strips a leading byte-order mark that would otherwise be
        # glued onto the first header name; it reads BOM-less files unchanged.
        with open(fp, newline='', encoding='utf-8-sig', errors='ignore') as f:
            for row in csv.DictReader(f):
                game = _row_to_game(row, league, token, season_start)
                if game is not None:
                    out.append(game)

    out.sort(key=lambda game: game['date'])
    out_file = BETTING / f"{league}_historical.json"
    BETTING.mkdir(parents=True, exist_ok=True)
    # Context manager so the file is flushed and closed before we report success.
    with open(out_file, 'w', encoding='utf-8') as fh:
        json.dump(out, fh, indent=2)

    real = sum(1 for g in out if g.get('hasRealOdds'))
    print(f"{league}: wrote {len(out)} games -> {out_file} | hasRealOdds={real} ({(real/len(out)*100) if out else 0:.1f}%)")


def main():
    """Rebuild the historical betting file of every league listed in LEAGUES."""
    for league_key in LEAGUES:
        build_league(league_key)


# Run the build only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
