#!/usr/bin/env python3
"""
Split large tick CSV into monthly files for faster loading.

Input:  BTCUSD_PAST6MONTHS.csv (2.1GB, 26.5M lines)
Output: data/monthly/BTCUSD_YYYYMM.csv (one per month, with header)

Also generates data/monthly/index.json with metadata per file.

Usage: python3 split_data.py [--source BTCUSD_PAST6MONTHS.csv]
"""

import os
import sys
import json
import time

# --- Configuration ---------------------------------------------------------
# Optional first CLI argument overrides the default source CSV path.
if len(sys.argv) > 1:
    SOURCE = sys.argv[1]
else:
    SOURCE = "BTCUSD_PAST6MONTHS.csv"

OUT_DIR = "data/monthly"
os.makedirs(OUT_DIR, exist_ok=True)  # idempotent: fine if the dir already exists

print(f"Splitting {SOURCE} into monthly files...")
start = time.time()  # wall-clock anchor for progress and the final summary

# --- Streaming-split state (one output file open at a time) ---------------
header = None          # first line of the source CSV, copied into every output
current_month = None   # "YYYYMM" key of the month currently being written
current_file = None    # open handle for the current month's output CSV
index = {}             # per-month metadata, serialized to index.json at the end
line_count = 0         # total source lines read (header included)
file_line_count = 0    # data lines written to the current month's file

# --- Streaming split: copy each data line into its month's file ------------
# The source is read once, line by line; output files are opened and closed
# as the month key changes. Assumes the source is sorted by timestamp so each
# month is one contiguous run — TODO(review): confirm for the real export.
with open(SOURCE, "r", encoding="utf-8") as f:
    for line in f:
        line_count += 1

        # First source line is the CSV header; remember it so every
        # monthly file can be self-contained.
        if line_count == 1:
            header = line
            continue

        # Skip stray blank lines (e.g. a doubled trailing newline). Without
        # this guard, line[:6] would yield a bogus month key and open a
        # garbage output file such as "BTCUSD_\n.csv".
        if not line.strip():
            continue

        # Timestamps look like "20250808 00:00:00:485,...", so the first
        # six characters are the YYYYMM month key.
        month_key = line[:6]

        if month_key != current_month:
            # Month boundary: finalize the previous month's file first.
            if current_file:
                current_file.close()
                index[current_month]["lines"] = file_line_count
                index[current_month]["size_mb"] = round(
                    os.path.getsize(index[current_month]["path"]) / 1024 / 1024, 1
                )
                print(f"  {current_month}: {file_line_count:,} ticks ({index[current_month]['size_mb']} MB)")

            # Open the new month's file and seed it with the header.
            current_month = month_key
            filename = f"BTCUSD_{month_key}.csv"
            filepath = os.path.join(OUT_DIR, filename)
            current_file = open(filepath, "w", encoding="utf-8")
            current_file.write(header)  # header goes into each monthly file
            file_line_count = 0

            # The month's first data line gives its start timestamp; only
            # the first field is needed, so cap the split.
            first_ts = line.split(",", 1)[0]
            index[month_key] = {
                "file": filename,
                "path": filepath,
                "month": f"{month_key[:4]}-{month_key[4:]}",
                "first_timestamp": first_ts,
                "lines": 0,
            }

        current_file.write(line)
        file_line_count += 1

        # Lightweight progress report every 5M lines.
        if line_count % 5000000 == 0:
            elapsed = time.time() - start
            print(f"  ... {line_count:,} lines processed ({elapsed:.0f}s)")

# --- Finalize: close the last open file and record its stats ---------------
if current_file:
    current_file.close()
    index[current_month]["lines"] = file_line_count
    index[current_month]["size_mb"] = round(
        os.path.getsize(index[current_month]["path"]) / 1024 / 1024, 1
    )
    print(f"  {current_month}: {file_line_count:,} ticks ({index[current_month]['size_mb']} MB)")

# Record each file's last timestamp by reading only its final line — the
# files can be hundreds of MB, so scan backwards from the end instead of
# reading the whole file.
for month_key, info in index.items():
    with open(info["path"], "rb") as f:
        f.seek(0, 2)  # os.SEEK_END
        # Start just before the trailing newline; clamp so a file shorter
        # than 2 bytes cannot produce a negative (invalid) seek offset.
        pos = max(f.tell() - 2, 0)
        while pos > 0:
            f.seek(pos)
            if f.read(1) == b"\n":
                break
            pos -= 1
        if pos == 0:
            # No newline found before the last line (single-line file).
            # Without this reset, readline() would continue from wherever
            # the last read(1) left the cursor and return garbage.
            f.seek(0)
        last_line = f.readline().decode()
        info["last_timestamp"] = last_line.split(",", 1)[0]

# --- Persist per-month metadata and print the run summary ------------------
index_path = os.path.join(OUT_DIR, "index.json")
with open(index_path, "w") as out:
    json.dump(index, out, indent=2)

elapsed = time.time() - start
total_ticks = line_count - 1  # all source lines minus the header row
print(f"\nDone! Split {total_ticks:,} ticks into {len(index)} monthly files in {elapsed:.0f}s")
print(f"Index: {index_path}")
print(f"\nMonths available:")
for _, meta in sorted(index.items()):
    print(f"  {meta['month']}: {meta['lines']:,} ticks ({meta['size_mb']} MB)")
