#!/usr/bin/env python3
"""
Lightweight RAG Service for Sports Betting Knowledge Base

Uses OpenAI embeddings for simplicity and reliability.
Processes sports betting PDFs and serves knowledge retrieval.
"""

import os
import json
import time
import hashlib
from pathlib import Path
from flask import Flask, request, jsonify
from functools import wraps

# Optional: Try to import PDF processing backends.
# Each backend is probed independently so that HAS_PYMUPDF and HAS_PYPDF are
# both always defined; pypdf can then act as a per-file fallback when PyMuPDF
# is installed but fails on a particular document.
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False

try:
    from pypdf import PdfReader
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False

# Optional: Try to import numpy for vector math (pure-Python fallback otherwise)
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False

# Flask application object; the route decorators below register on it.
app = Flask(__name__)

# Configuration
# Root data directory; overridable through the DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get('DATA_DIR', '/var/www/html/eventheodds/data'))
# Directory holding the JSON cache of chunks and their embeddings.
RAG_CACHE_DIR = DATA_DIR / 'rag_cache'
PDF_DIR = DATA_DIR / 'csv'  # Sports betting PDFs are in csv folder

# Secrets are read once at import time; each is None when the variable is unset.
# FLASK_API_KEY guards the endpoints decorated with require_api_key.
FLASK_API_KEY = os.environ.get('FLASK_API_KEY')
# OPENAI_API_KEY enables remote embeddings in get_embedding.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# NOTE(review): GROK_API_KEY is not referenced in the code visible here.
GROK_API_KEY = os.environ.get('GROK_API_KEY')

# In-memory knowledge base
# knowledge_base holds chunk dicts (each with at least a 'content' key);
# knowledge_embeddings holds one vector per chunk, at the same index.
knowledge_base = []
knowledge_embeddings = []

def require_api_key(f):
    """Decorator that rejects requests lacking the configured API key.

    The key is taken from the ``X-API-Key`` header or, when that is absent,
    from the ``Authorization`` header (a leading ``Bearer `` is stripped).
    The wrapped view only runs when the supplied key equals ``FLASK_API_KEY``;
    otherwise a JSON ``401 Unauthorized`` response is returned. When
    ``FLASK_API_KEY`` is not configured every request is rejected.
    """
    import hmac

    @wraps(f)
    def decorated(*args, **kwargs):
        api_key = request.headers.get('X-API-Key')
        if not api_key:
            api_key = request.headers.get('Authorization', '')
            # Only strip the scheme when it is really a prefix, not wherever
            # the substring happens to occur inside the header value.
            if api_key.startswith('Bearer '):
                api_key = api_key[len('Bearer '):]

        if not api_key or not FLASK_API_KEY:
            return jsonify({'error': 'Unauthorized'}), 401

        # Constant-time comparison so response timing does not leak how many
        # leading characters of the key were guessed correctly. Bytes are used
        # because compare_digest rejects non-ASCII str arguments.
        supplied = api_key.encode('utf-8')
        expected = FLASK_API_KEY.encode('utf-8')
        if not hmac.compare_digest(supplied, expected):
            return jsonify({'error': 'Unauthorized'}), 401
        return f(*args, **kwargs)
    return decorated


def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the text of a PDF, trying PyMuPDF first and pypdf second.

    Args:
        pdf_path: Location of the PDF file on disk.

    Returns:
        The concatenated text of all pages from the first backend that reads
        the whole document. If every available backend fails, the longest
        partial text recovered before a failure is returned (possibly ``""``).
        Backend errors are printed, never raised.
    """
    # HAS_PYPDF may be unset if the module-level import probe never reached
    # pypdf; look it up defensively instead of risking a NameError.
    has_pypdf = globals().get('HAS_PYPDF', False)

    # Best partial result seen so far, used only if no backend succeeds.
    partial = ""

    if HAS_PYMUPDF:
        pages = []
        doc = None
        try:
            doc = fitz.open(str(pdf_path))
            for page in doc:
                pages.append(page.get_text())
            return "".join(pages)
        except Exception as e:
            print(f"PyMuPDF failed: {e}")
            partial = "".join(pages)
        finally:
            # Release the document even when a page fails mid-way.
            if doc is not None:
                doc.close()

    if has_pypdf:
        # Start from scratch: mixing PyMuPDF's partial output with pypdf's
        # pages would duplicate the beginning of the document.
        pages = []
        try:
            reader = PdfReader(str(pdf_path))
            for page in reader.pages:
                pages.append(page.extract_text() or "")
            return "".join(pages)
        except Exception as e:
            print(f"PyPDF failed: {e}")
            recovered = "".join(pages)
            if len(recovered) > len(partial):
                partial = recovered

    return partial


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with keys ``content`` (the stripped chunk text),
        ``start`` and ``end`` (character offsets into ``text``, with ``end``
        never exceeding ``len(text)``). Whitespace-only chunks are skipped.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range; such
            values would otherwise prevent the scan from advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")

    chunks = []
    text_len = len(text)
    start = 0

    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end]

        # Try to break at a sentence boundary, but only when more text follows
        # and the boundary leaves at least half a chunk of content.
        if end < text_len:
            last_period = chunk.rfind('.')
            if last_period > chunk_size // 2:
                chunk = chunk[:last_period + 1]
                end = start + last_period + 1

        content = chunk.strip()
        if content:
            chunks.append({
                'content': content,
                'start': start,
                'end': end
            })

        # The text is exhausted; stepping back by `overlap` here would only
        # emit a tail that is already fully contained in the last chunk.
        if end >= text_len:
            break

        # Step back by `overlap`, but always make forward progress: a short
        # sentence-bounded chunk can be smaller than the overlap itself.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def get_embedding(text: str) -> list:
    """Return an embedding vector for ``text``.

    When ``OPENAI_API_KEY`` is set, the OpenAI ``text-embedding-3-small``
    model is queried with the first 8000 characters of ``text``. If the key
    is missing or the request fails for any reason, a local fallback is used:
    a 128-dimensional, L2-normalised hashed bag-of-words vector that only
    supports rough keyword matching.

    Args:
        text: The text to embed.

    Returns:
        A list of floats. Its length depends on which path produced it, so
        vectors from the two paths are not comparable with each other.
    """
    # Try OpenAI
    if OPENAI_API_KEY:
        try:
            # Imported lazily so the offline fallback works without requests.
            import requests

            resp = requests.post(
                'https://api.openai.com/v1/embeddings',
                headers={
                    'Authorization': f'Bearer {OPENAI_API_KEY}',
                    'Content-Type': 'application/json'
                },
                json={
                    'model': 'text-embedding-3-small',
                    'input': text[:8000]  # Limit input
                },
                timeout=30
            )
            if resp.ok:
                return resp.json()['data'][0]['embedding']
            # Surface HTTP-level failures instead of silently degrading.
            print(f"OpenAI embedding failed: HTTP {resp.status_code}")
        except Exception as e:
            print(f"OpenAI embedding failed: {e}")

    # Fallback: simple hash-based "embedding" for keyword matching.
    # A stable digest is used rather than the built-in hash(): str hashes are
    # salted per process, so vectors persisted in the cache would no longer
    # line up with query vectors computed after a restart.
    dims = 128
    vec = [0.0] * dims
    for word in text.lower().split():
        digest = hashlib.blake2b(
            word.encode('utf-8', errors='replace'), digest_size=8
        ).digest()
        vec[int.from_bytes(digest, 'big') % dims] += 1.0

    # Normalize
    norm = sum(v * v for v in vec) ** 0.5
    if norm > 0:
        vec = [v / norm for v in vec]
    return vec


def cosine_similarity(a: list, b: list) -> float:
    """Calculate the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        The cosine of the angle between ``a`` and ``b``. ``0.0`` is returned
        when the vectors differ in length or when either has zero magnitude,
        so both the NumPy and the pure-Python paths behave the same and never
        produce NaN or raise on a shape mismatch.
    """
    # Vectors of different dimensionality are not comparable.
    if len(a) != len(b):
        return 0.0

    if not HAS_NUMPY:
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5
        denom = norm_a * norm_b
        return dot / denom if denom else 0.0

    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    if not denom:
        # Avoid the 0/0 division that would yield NaN.
        return 0.0
    return float(np.dot(vec_a, vec_b) / denom)


def load_knowledge_base():
    """Populate the in-memory knowledge base from the cache or from PDFs.

    If ``knowledge_cache.json`` exists in ``RAG_CACHE_DIR`` and is consistent
    (one embedding per chunk), its contents are loaded and nothing else is
    done. Otherwise every ``*.pdf`` in ``PDF_DIR`` is extracted and chunked,
    the built-in betting notes are appended, an embedding is generated for
    each chunk, and the result is written back to the cache.

    Side effects:
        Rebinds the module globals ``knowledge_base`` and
        ``knowledge_embeddings`` and creates ``RAG_CACHE_DIR`` if needed.
        Cache read and write failures are printed, never raised.
    """
    global knowledge_base, knowledge_embeddings

    cache_file = RAG_CACHE_DIR / 'knowledge_cache.json'
    RAG_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Check cache
    if cache_file.exists():
        try:
            with open(cache_file, encoding='utf-8') as f:
                cache = json.load(f)
            cached_chunks = cache.get('chunks', [])
            cached_embeddings = cache.get('embeddings', [])
            # A cache whose lists do not line up would make searches index
            # past the end of one of them; treat it as corrupt and rebuild.
            if (not isinstance(cached_chunks, list)
                    or not isinstance(cached_embeddings, list)
                    or len(cached_chunks) != len(cached_embeddings)):
                raise ValueError('chunks and embeddings are inconsistent')
            knowledge_base = cached_chunks
            knowledge_embeddings = cached_embeddings
            print(f"[RAG] Loaded {len(knowledge_base)} chunks from cache")
            return
        except Exception as e:
            print(f"[RAG] Cache load failed: {e}")

    # Process PDFs (sorted so the chunk order is reproducible across runs)
    pdf_files = sorted(PDF_DIR.glob('*.pdf'))
    print(f"[RAG] Found {len(pdf_files)} PDF files")

    all_chunks = []
    for pdf_path in pdf_files:
        print(f"[RAG] Processing: {pdf_path.name}")
        text = extract_text_from_pdf(pdf_path)
        if text:
            chunks = chunk_text(text)
            for chunk in chunks:
                chunk['source'] = pdf_path.name
            all_chunks.extend(chunks)
            print(f"[RAG] Extracted {len(chunks)} chunks from {pdf_path.name}")

    # Add built-in sports betting knowledge
    builtin_knowledge = [
        {
            'content': 'Moneyline betting is the simplest form of sports betting. You pick the team you think will win. Favorites have negative odds (e.g., -150), underdogs have positive odds (e.g., +130). To profit $100 on a -150 favorite, you must bet $150.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Point spread betting levels the playing field. The favorite must win by more than the spread to cover. For example, if Lakers are -5.5, they must win by 6+ points. Underdogs can lose by less than the spread and still cover.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Over/Under (totals) betting predicts whether the combined score will be over or under a set number. For NBA, totals typically range from 210-240. Weather, injuries, and pace of play affect totals.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Parlay betting combines multiple bets into one. All legs must win for the parlay to pay. Higher risk but higher reward. A 3-team parlay might pay 6:1. Round robins create multiple smaller parlays from your selections.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Sharp bettors look for value - when odds underestimate a teams true probability. Line movement shows where sharp money is going. Fading the public (betting against popular opinion) can be profitable.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Bankroll management is crucial. Never bet more than 1-5% of your bankroll on a single bet. The Kelly Criterion helps calculate optimal bet sizing based on edge and odds.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Home field advantage matters. NBA home teams win about 55-60% of games. NFL home teams cover the spread about 50.5% of the time. Factor this into your analysis.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Injuries significantly impact betting lines. Star player absences can move lines 3-7 points in NBA. Always check injury reports before placing bets.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'Back-to-back games in NBA create fatigue situations. Teams playing their second game in two nights cover less often. Look for rest advantage spots.',
            'source': 'builtin_sports_knowledge'
        },
        {
            'content': 'NFL divisional games are often closer than the spread suggests. Teams know each other well. Consider fading large spreads in divisional matchups.',
            'source': 'builtin_sports_knowledge'
        }
    ]

    all_chunks.extend(builtin_knowledge)

    print(f"[RAG] Generating embeddings for {len(all_chunks)} chunks...")
    embeddings = []
    for i, chunk in enumerate(all_chunks):
        emb = get_embedding(chunk['content'])
        embeddings.append(emb)
        if (i + 1) % 50 == 0:
            print(f"[RAG] Processed {i + 1}/{len(all_chunks)} embeddings")

    knowledge_base = all_chunks
    knowledge_embeddings = embeddings

    # Save cache: write to a temporary file and swap it into place so an
    # interrupted write cannot leave a truncated cache behind.
    try:
        tmp_file = cache_file.with_name(cache_file.name + '.tmp')
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump({
                'chunks': all_chunks,
                'embeddings': embeddings,
                'created': time.time()
            }, f)
        os.replace(tmp_file, cache_file)
        print(f"[RAG] Saved cache with {len(all_chunks)} chunks")
    except Exception as e:
        print(f"[RAG] Failed to save cache: {e}")


def search_knowledge(query: str, k: int = 5) -> list:
    """Return the ``k`` knowledge chunks most similar to ``query``.

    Args:
        query: Free-text search query.
        k: Maximum number of results; values below 1 yield no results.

    Returns:
        A list of dicts with keys ``content``, ``source`` and ``score``,
        ordered by descending cosine similarity. Stored embeddings whose
        dimensionality differs from the query embedding are skipped, as are
        pairs that produce a NaN score.
    """
    import math

    if not knowledge_base or k <= 0:
        return []

    query_embedding = get_embedding(query)
    query_dims = len(query_embedding)

    # Calculate similarities. zip() bounds the scan to entries that have both
    # a chunk and an embedding, so a length mismatch cannot cause IndexError.
    scores = []
    for idx, (_chunk, emb) in enumerate(zip(knowledge_base, knowledge_embeddings)):
        # Embeddings from a different backend (e.g. the hashed fallback versus
        # the remote model) have another size and cannot be compared.
        if len(emb) != query_dims:
            continue
        sim = cosine_similarity(query_embedding, emb)
        if math.isnan(sim):
            # NaN would make the ordering below undefined.
            continue
        scores.append((sim, idx))

    # Sort by similarity
    scores.sort(reverse=True)

    results = []
    for score, idx in scores[:k]:
        chunk = knowledge_base[idx]
        results.append({
            'content': chunk['content'],
            'source': chunk.get('source', 'unknown'),
            'score': score
        })

    return results


@app.route('/health', methods=['GET'])
def health():
    """Report liveness plus the current size of the in-memory stores."""
    chunk_count = len(knowledge_base)
    embedding_count = len(knowledge_embeddings)
    payload = {
        'status': 'healthy',
        'chunks_loaded': chunk_count,
        'embeddings_loaded': embedding_count,
    }
    return jsonify(payload)


@app.route('/upload', methods=['POST'])
@require_api_key
def upload_file():
    """Upload a PDF or text file and add its contents to the knowledge base.

    Expects a multipart form with a ``file`` part. The file is stored in
    ``PDF_DIR`` under a sanitised name, its text is extracted, chunked and
    embedded, and the new chunks are appended to the in-memory knowledge base
    and written to the cache.

    Returns:
        JSON with ``success``, ``message``, ``chunks_added`` and
        ``total_chunks`` on success; a JSON ``error`` with status 400 for
        invalid input or 500 for unexpected failures.
    """
    global knowledge_base, knowledge_embeddings

    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']
        if not file.filename:
            return jsonify({'error': 'No file selected'}), 400

        # The filename is client-controlled: drop any directory components
        # (both separator styles) so a name such as "../../x.txt" cannot
        # escape PDF_DIR.
        base_name = Path(file.filename.replace('\\', '/')).name
        safe_filename = base_name.replace(' ', '_')
        if not safe_filename or safe_filename in ('.', '..'):
            return jsonify({'error': 'Invalid filename'}), 400

        # Check file extension
        allowed_extensions = {'.pdf', '.txt'}
        ext = Path(safe_filename).suffix.lower()
        if ext not in allowed_extensions:
            return jsonify({'error': f'File type not allowed. Allowed: {allowed_extensions}'}), 400

        # Save file to PDF directory
        PDF_DIR.mkdir(parents=True, exist_ok=True)
        file_path = PDF_DIR / safe_filename
        file.save(str(file_path))

        print(f"[RAG] Uploaded file: {safe_filename}")

        # Process the file
        if ext == '.pdf':
            text = extract_text_from_pdf(file_path)
        else:
            # Decode explicitly; the locale default may not be UTF-8 and a
            # stray byte should not abort the whole upload.
            text = file_path.read_text(encoding='utf-8', errors='replace')

        if not text or not text.strip():
            return jsonify({'error': 'Could not extract text from file'}), 400

        # Create chunks
        chunks = chunk_text(text)
        for chunk in chunks:
            chunk['source'] = safe_filename

        # Generate all embeddings first so the two global lists are only
        # extended together, keeping them index-aligned.
        new_embeddings = [get_embedding(chunk['content']) for chunk in chunks]

        knowledge_base.extend(chunks)
        knowledge_embeddings.extend(new_embeddings)

        # Update cache (best effort: a failure is logged, not returned)
        cache_file = RAG_CACHE_DIR / 'knowledge_cache.json'
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'chunks': knowledge_base,
                    'embeddings': knowledge_embeddings,
                    'updated': time.time()
                }, f)
        except Exception as e:
            print(f"[RAG] Cache update failed: {e}")

        return jsonify({
            'success': True,
            'message': f'File "{safe_filename}" uploaded and processed',
            'chunks_added': len(chunks),
            'total_chunks': len(knowledge_base)
        })

    except Exception as e:
        print(f"[RAG] Upload error: {e}")
        return jsonify({'error': str(e)}), 500


@app.route('/', methods=['GET'])
def index():
    """Serve the static HTML upload page.

    The returned markup contains a status panel that the page's script fills
    from ``GET /status`` and a form whose submit handler posts the selected
    file to ``POST /upload`` with the entered key in the ``X-API-Key``
    header. The page itself is served without authentication.
    """
    return '''
    <!DOCTYPE html>
    <html>
    <head>
        <title>Sports Betting RAG - Upload</title>
        <style>
            body { font-family: Arial, sans-serif; max-width: 800px; margin: 50px auto; padding: 20px; background: #1a1a2e; color: #eee; }
            h1 { color: #4ecdc4; }
            .upload-form { background: #16213e; padding: 30px; border-radius: 10px; margin: 20px 0; }
            input[type="file"] { margin: 10px 0; }
            button { background: #4ecdc4; color: #1a1a2e; border: none; padding: 10px 20px; cursor: pointer; border-radius: 5px; font-weight: bold; }
            button:hover { background: #45b7aa; }
            .status { background: #0f3460; padding: 20px; border-radius: 10px; margin: 20px 0; }
            .info { color: #888; font-size: 14px; }
            #result { margin-top: 20px; padding: 15px; border-radius: 5px; display: none; }
            .success { background: #1e5631; }
            .error { background: #5c1a1a; }
        </style>
    </head>
    <body>
        <h1>🏆 Sports Betting RAG</h1>
        <div class="status" id="status">Loading status...</div>
        
        <div class="upload-form">
            <h2>📤 Upload Document</h2>
            <p class="info">Upload PDF or TXT files to add to the knowledge base.</p>
            <form id="uploadForm" enctype="multipart/form-data">
                <input type="file" id="fileInput" name="file" accept=".pdf,.txt" required>
                <br><br>
                <input type="password" id="apiKey" placeholder="API Key" required style="padding: 8px; width: 200px;">
                <br><br>
                <button type="submit">Upload & Process</button>
            </form>
            <div id="result"></div>
        </div>
        
        <script>
            // Load status
            fetch('/status')
                .then(r => r.json())
                .then(data => {
                    document.getElementById('status').innerHTML = `
                        <strong>📊 Status:</strong><br>
                        Total Documents: ${data.total_documents}<br>
                        PDF Directory: ${data.pdf_dir}<br>
                        Has PyMuPDF: ${data.has_pymupdf ? '✅' : '❌'}<br>
                        Has NumPy: ${data.has_numpy ? '✅' : '❌'}
                    `;
                });
            
            // Handle upload
            document.getElementById('uploadForm').onsubmit = async (e) => {
                e.preventDefault();
                const formData = new FormData();
                formData.append('file', document.getElementById('fileInput').files[0]);
                
                const result = document.getElementById('result');
                result.style.display = 'block';
                result.className = '';
                result.textContent = 'Uploading...';
                
                try {
                    const resp = await fetch('/upload', {
                        method: 'POST',
                        headers: { 'X-API-Key': document.getElementById('apiKey').value },
                        body: formData
                    });
                    const data = await resp.json();
                    
                    if (resp.ok) {
                        result.className = 'success';
                        result.textContent = `✅ ${data.message} (${data.chunks_added} chunks added, ${data.total_chunks} total)`;
                        // Reload status
                        location.reload();
                    } else {
                        result.className = 'error';
                        result.textContent = `❌ Error: ${data.error}`;
                    }
                } catch (err) {
                    result.className = 'error';
                    result.textContent = `❌ Error: ${err.message}`;
                }
            };
        </script>
    </body>
    </html>
    '''


@app.route('/ask', methods=['POST'])
@require_api_key
def ask():
    """Answer a question from the knowledge base.

    Expects a JSON object with a non-empty string ``question`` and an
    optional positive integer ``k`` (default 5) giving the number of chunks
    to retrieve.

    Returns:
        JSON with ``answer`` (the top three chunks joined under a fixed
        heading) and ``sources`` (source, preview and score for every
        retrieved chunk). Invalid input yields status 400; unexpected
        failures yield status 500.
    """
    try:
        # silent=True returns None for a missing or malformed JSON body, so
        # bad input is reported as a 400 instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        question = data.get('question', '')
        if not isinstance(question, str) or not question.strip():
            return jsonify({'error': 'Question is required'}), 400
        question = question.strip()

        k = data.get('k', 5)
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(k, bool) or not isinstance(k, int) or k < 1:
            return jsonify({'error': 'k must be a positive integer'}), 400

        # Search for relevant chunks
        results = search_knowledge(question, k=k)

        if not results:
            return jsonify({
                'answer': 'No relevant information found in the knowledge base.',
                'sources': []
            })

        # Format answer from top results
        context = '\n\n'.join([r['content'] for r in results[:3]])

        # Simple answer generation (could use Grok/OpenAI for better answers)
        answer = f"Based on sports betting knowledge:\n\n{context}"

        sources = [{
            'source': r['source'],
            'preview': r['content'][:200] + '...' if len(r['content']) > 200 else r['content'],
            'score': r['score']
        } for r in results]

        return jsonify({
            'answer': answer,
            'sources': sources
        })

    except Exception as e:
        print(f"[RAG] Error in /ask: {e}")
        return jsonify({'error': str(e)}), 500


@app.route('/add-insight', methods=['POST'])
@require_api_key
def add_insight():
    """Add a single free-text insight to the knowledge base.

    Expects a JSON object with a non-empty string ``content`` and optional
    ``metadata`` (default ``{}``) and ``source`` (default ``'api_insight'``).
    The insight is embedded, appended to the in-memory knowledge base and
    written to the cache.

    Returns:
        JSON with ``success``, ``message`` and ``total_chunks``. Invalid
        input yields status 400; unexpected failures yield status 500.
    """
    global knowledge_base, knowledge_embeddings

    try:
        # silent=True returns None for a missing or malformed JSON body, so
        # bad input is reported as a 400 instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        content = data.get('content', '')
        if not isinstance(content, str) or not content.strip():
            return jsonify({'error': 'Content is required'}), 400
        content = content.strip()

        metadata = data.get('metadata', {})
        source = data.get('source', 'api_insight')

        # Create new chunk
        chunk = {
            'content': content,
            'source': source,
            'metadata': metadata,
            'added': time.time()
        }

        # Generate the embedding before touching the globals so the two
        # lists are only ever extended together.
        embedding = get_embedding(content)

        # Add to knowledge base
        knowledge_base.append(chunk)
        knowledge_embeddings.append(embedding)

        # Update cache (best effort: a failure is logged, not returned)
        cache_file = RAG_CACHE_DIR / 'knowledge_cache.json'
        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'chunks': knowledge_base,
                    'embeddings': knowledge_embeddings,
                    'updated': time.time()
                }, f)
        except Exception as e:
            print(f"[RAG] Cache update failed: {e}")

        return jsonify({
            'success': True,
            'message': 'Insight added to knowledge base',
            'total_chunks': len(knowledge_base)
        })

    except Exception as e:
        print(f"[RAG] Error in /add-insight: {e}")
        return jsonify({'error': str(e)}), 500


@app.route('/status', methods=['GET'])
def status():
    """Describe the RAG system: store sizes, directories and optional libs."""
    report = {}
    report['total_documents'] = len(knowledge_base)
    report['embeddings_loaded'] = len(knowledge_embeddings)
    report['pdf_dir'] = str(PDF_DIR)
    report['cache_dir'] = str(RAG_CACHE_DIR)
    report['has_pymupdf'] = HAS_PYMUPDF
    report['has_numpy'] = HAS_NUMPY
    return jsonify(report)


@app.route('/reload', methods=['POST'])
@require_api_key
def reload():
    """Discard the cache and rebuild the knowledge base from the PDFs.

    Returns:
        JSON with ``success`` and ``chunks_loaded`` on success, or a JSON
        ``error`` with status 500 if the rebuild fails.
    """
    try:
        # Clear cache. Attempt the removal directly rather than checking
        # exists() first, which would race with a concurrent removal.
        cache_file = RAG_CACHE_DIR / 'knowledge_cache.json'
        try:
            cache_file.unlink()
        except FileNotFoundError:
            # Nothing to clear; the rebuild below recreates the cache.
            pass

        # Reload
        load_knowledge_base()

        return jsonify({
            'success': True,
            'chunks_loaded': len(knowledge_base)
        })
    except Exception as e:
        # Log like the other handlers so failures show up in the server log.
        print(f"[RAG] Error in /reload: {e}")
        return jsonify({'error': str(e)}), 500


# Initialize on startup
# This runs at import time, so the knowledge base is loaded whether the module
# is executed directly or imported by another process.
print("[RAG] Initializing Sports Betting RAG Service...")
load_knowledge_base()
print(f"[RAG] Ready with {len(knowledge_base)} knowledge chunks")


# Direct execution: start Flask's built-in server on all interfaces.
if __name__ == '__main__':
    # Port comes from the RAG_PORT environment variable, defaulting to 5000.
    port = int(os.environ.get('RAG_PORT', 5000))
    print(f"[RAG] Starting on port {port}")
    app.run(host='0.0.0.0', port=port, debug=False)