import os
from pathlib import Path

# Paths
# Every location is derived from the directory that contains this file.
BASE_DIR = Path(__file__).parent
PDF_DIR = BASE_DIR.joinpath("pdf_directory")
PROCESSED_DIR = BASE_DIR.joinpath("processed")

# Sub-folders of PROCESSED_DIR, one per artifact type.
EMBEDDINGS_DIR = PROCESSED_DIR.joinpath("embeddings")
DOCUMENTS_DIR = PROCESSED_DIR.joinpath("documents")
METADATA_DIR = PROCESSED_DIR.joinpath("metadata")
NOTES_DIR = PROCESSED_DIR.joinpath("notes")
DIGESTS_DIR = PROCESSED_DIR.joinpath("digests")
CAPSULES_DIR = PROCESSED_DIR.joinpath("capsules")

# Create directories (import-time side effect).
# PROCESSED_DIR itself is not listed: parents=True creates it together with
# its first sub-folder. exist_ok=True makes repeated imports harmless.
for dir_path in (
    PDF_DIR,
    EMBEDDINGS_DIR,
    DOCUMENTS_DIR,
    METADATA_DIR,
    NOTES_DIR,
    DIGESTS_DIR,
    CAPSULES_DIR,
):
    dir_path.mkdir(parents=True, exist_ok=True)

# Processing settings - OPTIMIZED FOR PRODUCTION
# Module-level chunking defaults. The 'default' entry of DOCUMENT_TYPE_CONFIGS
# below is built from these constants so the two can never disagree.
CHUNK_SIZE = 1000  # Target chunk size in characters (default)
CHUNK_OVERLAP = 200  # Overlap in characters (will be converted to sentences)
CHUNK_MIN_SIZE = 400  # Minimum chunk size (reject smaller)
CHUNK_MAX_SIZE = 1600  # Maximum chunk size
SENTENCE_OVERLAP = 2  # Number of sentences to overlap between chunks
MIN_SENTENCES_PER_CHUNK = 3  # Minimum sentences required per chunk

# Document type-specific chunking parameters.
# Each profile carries the same five keys:
#   chunk_size / chunk_min_size / chunk_max_size  - sizes in characters
#   sentence_overlap                              - sentences shared by neighbouring chunks
#   min_sentences                                 - minimum sentences per chunk
DOCUMENT_TYPE_CONFIGS = {
    'technical': {
        'chunk_size': 800,  # Smaller chunks for dense technical content
        'chunk_min_size': 400,
        'chunk_max_size': 1200,
        'sentence_overlap': 1,  # Less overlap for technical specs
        'min_sentences': 2,  # Technical content can be shorter
    },
    'research': {
        'chunk_size': 1000,  # Standard size for research papers
        'chunk_min_size': 500,
        'chunk_max_size': 1500,
        'sentence_overlap': 2,  # More overlap for narrative flow
        'min_sentences': 3,
    },
    'legal': {
        'chunk_size': 900,  # Slightly smaller for precise legal text
        'chunk_min_size': 500,
        'chunk_max_size': 1400,
        'sentence_overlap': 2,  # Preserve context in legal documents
        'min_sentences': 3,
    },
    'manual': {
        'chunk_size': 1200,  # Larger chunks for procedural content
        'chunk_min_size': 400,
        'chunk_max_size': 1800,
        'sentence_overlap': 2,  # Maintain procedural continuity
        'min_sentences': 3,
    },
    # Mirrors the module-level defaults instead of repeating their literals.
    'default': {
        'chunk_size': CHUNK_SIZE,
        'chunk_min_size': CHUNK_MIN_SIZE,
        'chunk_max_size': CHUNK_MAX_SIZE,
        'sentence_overlap': SENTENCE_OVERLAP,
        'min_sentences': MIN_SENTENCES_PER_CHUNK,
    },
}

# Embedding model identifier.
# Upgraded from all-MiniLM-L6-v2 for better technical content understanding.
EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

# DEPRECATED: semantic chunking is switched off; sentence-based chunking is
# used instead. The two tuning values are kept alongside the flag.
SEMANTIC_CHUNKING_ENABLED = False
SEMANTIC_MAX_CHARS = 1_800
SEMANTIC_SIMILARITY_THRESHOLD = 0.58

# Extensive keyword taxonomy for knowledge card search across all domains.
# Maps a domain name to a list of lowercase keywords/phrases. The same keyword
# may legitimately appear under several domains (e.g. 'spread', 'slippage').
# Repeats *within* one domain are removed right after the literal, so the
# hand-maintained groupings below can stay readable.

TRADING_KEYWORD_TAXONOMY = {
    # Core Trading Concepts
    'trading_concepts': [
        'trading', 'trade', 'trader', 'trading strategy', 'trading system', 'trading plan',
        'buy', 'sell', 'long', 'short', 'position', 'entry', 'exit', 'signal', 'indicator',
        'technical analysis', 'fundamental analysis', 'quantitative analysis', 'sentiment analysis',
        'risk management', 'position sizing', 'stop loss', 'take profit', 'trailing stop',
        'volatility', 'liquidity', 'slippage', 'spread', 'commission', 'fees', 'costs',
        'backtesting', 'forward testing', 'paper trading', 'live trading', 'simulation',
        'performance', 'returns', 'profit', 'loss', 'pnl', 'drawdown', 'sharpe ratio',
        'sortino ratio', 'maximum drawdown', 'win rate', 'profit factor', 'expectancy'
    ],

    # Sports Betting Keywords
    'sports_betting': [
        'sports betting', 'sportsbook', 'odds', 'line', 'spread', 'moneyline', 'over/under',
        'point spread', 'vig', 'juice', 'bookmaker', 'bookie', 'bet', 'wager', 'stake',
        'payout', 'bankroll', 'bankroll management', 'value bet', 'edge', 'advantage',
        'handicap', 'asian handicap', 'double chance', 'draw no bet', 'accumulator',
        'parlay', 'round robin', 'teaser', 'pleaser', 'if bet', 'reverse bet',
        'arbitrage', 'sure bet', 'hedge', 'scalping', 'middle', 'live betting',
        'in-play', 'cash out', 'early payout', 'betting exchange', 'matched betting',
        'dutching', 'dutch', 'kelly criterion', 'martingale', 'fibonacci', 'labouchere',
        'flat betting', 'percentage betting', 'unit betting', 'staking plan'
    ],

    # Stocks / Equities Keywords
    'stocks_equities': [
        'stock', 'equity', 'share', 'shares', 'stock market', 'equity market', 'nyse',
        'nasdaq', 'london stock exchange', 'tokyo stock exchange', 'hong kong exchange',
        'dividend', 'dividend yield', 'earnings', 'earnings per share', 'eps', 'pe ratio',
        'price to earnings', 'peg ratio', 'price to sales', 'price to book', 'market cap',
        'blue chip', 'growth stock', 'value stock', 'momentum stock', 'cyclical stock',
        'defensive stock', 'income stock', 'ipo', 'initial public offering', 'secondary offering',
        'split', 'reverse split', 'merger', 'acquisition', 'takeover', 'spin off',
        'earnings report', 'quarterly earnings', 'annual report', '10k', '10q', '8k',
        'insider trading', 'insider buying', 'insider selling', 'institutional ownership',
        'short interest', 'short squeeze', 'gamma squeeze', 'options flow', 'dark pool',
        'high frequency trading', 'algorithmic trading', 'quant trading'
    ],

    # Cryptocurrency Keywords
    'cryptocurrency': [
        'cryptocurrency', 'crypto', 'bitcoin', 'btc', 'ethereum', 'eth', 'altcoin',
        'decentralized finance', 'defi', 'non-fungible token', 'nft', 'blockchain',
        'smart contract', 'decentralized exchange', 'dex', 'centralized exchange', 'cex',
        'wallet', 'hot wallet', 'cold wallet', 'hardware wallet', 'software wallet',
        'mining', 'proof of work', 'pow', 'proof of stake', 'pos', 'staking', 'yield farming',
        'liquidity mining', 'impermanent loss', 'slippage', 'gas fee', 'network congestion',
        'bitcoin halving', 'ethereum merge', 'ethereum 2.0', 'layer 1', 'layer 2',
        'scaling solution', 'rollups', 'sidechain', 'cross chain', 'bridge', 'wrapped token',
        'stablecoin', 'tether', 'usdt', 'usdc', 'dai', 'fiat', 'crypto exchange',
        'spot trading', 'futures', 'perpetuals', 'options', 'derivatives', 'leverage',
        'margin trading', 'liquidation', 'short squeeze', 'whale', 'hodl', 'diamond hands',
        'paper hands', 'fomo', 'fud', 'rug pull', 'pump and dump', 'wash trading',
        'market manipulation', 'front running', 'sandwich attack'
    ],

    # Forex Keywords
    'forex_currency': [
        'forex', 'foreign exchange', 'currency trading', 'fx trading', 'currency pair',
        'major pair', 'minor pair', 'exotic pair', 'eur/usd', 'eurusd', 'gbp/usd', 'gbpusd',
        'usd/jpy', 'usdjpy', 'usd/chf', 'usdchf', 'aud/usd', 'audusd', 'usd/cad', 'usdcad',
        'nzd/usd', 'nzdusd', 'eur/gbp', 'eurgbp', 'eur/jpy', 'eurjpy', 'gbp/jpy', 'gbpjpy',
        'chf/jpy', 'chfjpy', 'aud/jpy', 'audjpy', 'cad/jpy', 'cadjpy', 'nzd/jpy', 'nzdjpy',
        'pip', 'pip value', 'lot size', 'standard lot', 'mini lot', 'micro lot', 'nano lot',
        'leverage', 'margin', 'margin call', 'stop out', 'carry trade', 'interest rate differential',
        'central bank', 'fed', 'federal reserve', 'ecb', 'european central bank', 'boj',
        'bank of japan', 'boe', 'bank of england', 'rbnz', 'reserve bank of new zealand',
        'interest rate decision', 'fomc', 'monetary policy', 'quantitative easing', 'qe',
        'tapering', 'inflation', 'cpi', 'ppi', 'employment data', 'non-farm payrolls', 'nfp',
        'economic indicator', 'fundamental analysis', 'technical analysis', 'price action',
        'support and resistance', 'trend line', 'fibonacci retracement', 'fibonacci extension',
        'moving average', 'rsi', 'relative strength index', 'macd', 'bollinger bands',
        'stochastic oscillator', 'ichimoku cloud', 'forex session', 'london session',
        'new york session', 'tokyo session', 'sydney session', 'overlapping session'
    ],

    # Risk Management Keywords
    'risk_management': [
        'risk', 'risk management', 'risk assessment', 'risk mitigation', 'risk control',
        'portfolio risk', 'systemic risk', 'idiosyncratic risk', 'market risk', 'credit risk',
        'liquidity risk', 'operational risk', 'model risk', 'tail risk', 'black swan',
        'value at risk', 'var', 'expected shortfall', 'es', 'stress testing', 'scenario analysis',
        'monte carlo simulation', 'correlation', 'diversification', 'hedging', 'insurance',
        'stop loss', 'trailing stop', 'breakeven stop', 'profit target', 'risk reward ratio',
        'win rate', 'loss rate', 'average win', 'average loss', 'expectancy', 'kelly criterion',
        'optimal f', 'fixed fractional', 'percentage risk', 'dollar risk', 'position sizing',
        'portfolio allocation', 'asset allocation', 'rebalancing', 'tactical allocation',
        'strategic allocation', 'core and satellite', 'barbell strategy', 'bullet strategy',
        'ladder strategy', 'buckets approach', 'liability matching', 'immunization'
    ],

    # Technical Analysis Keywords
    # ('williams %r' and 'mfi' are each written twice below; the de-duplication
    # step after the literal keeps only the first occurrence.)
    'technical_analysis': [
        'technical analysis', 'technical indicator', 'chart pattern', 'candlestick',
        'price action', 'trend', 'trend following', 'mean reversion', 'breakout',
        'breakdown', 'reversal', 'continuation', 'support', 'resistance', 'pivot point',
        'fibonacci', 'golden ratio', 'elliott wave', 'harmonic pattern', 'gartley',
        'bat pattern', 'butterfly pattern', 'crab pattern', 'shark pattern', 'cypher pattern',
        'moving average', 'simple moving average', 'sma', 'exponential moving average', 'ema',
        'weighted moving average', 'wma', 'hull moving average', 'hma', 'vwap', 'volume weighted',
        'bollinger bands', 'bollinger band', 'keltner channel', 'donchian channel',
        'envelope', 'price channel', 'regression channel', 'trend channel', 'ascending channel',
        'descending channel', 'horizontal channel', 'triangle', 'wedge', 'flag', 'pennant',
        'double top', 'double bottom', 'head and shoulders', 'inverse head and shoulders',
        'cup and handle', 'ascending triangle', 'descending triangle', 'symmetrical triangle',
        'rising wedge', 'falling wedge', 'bullish flag', 'bearish flag', 'bullish pennant',
        'bearish pennant', 'rsi', 'relative strength index', 'stochastic', 'stochastic oscillator',
        'williams %r', 'williams percent r', 'macd', 'moving average convergence divergence',
        'commodity channel index', 'cci', 'momentum', 'rate of change', 'roc', 'williams %r',
        'ultimate oscillator', 'awesome oscillator', 'acceleration deceleration', 'ac',
        'balance of power', 'bop', 'chaikin money flow', 'cmf', 'ease of movement', 'eom',
        'force index', 'money flow index', 'mfi', 'negative volume index', 'nvi',
        'on balance volume', 'obv', 'positive volume index', 'pvi', 'volume oscillator',
        'accumulation distribution', 'a/d line', 'chaikin oscillator', 'market facilitation index',
        'mfi', 'schaff trend cycle', 'stc', 'true strength index', 'tsi'
    ],

    # Fundamental Analysis Keywords
    'fundamental_analysis': [
        'fundamental analysis', 'fundamentals', 'economic data', 'economic indicator',
        'gross domestic product', 'gdp', 'consumer price index', 'cpi', 'producer price index', 'ppi',
        'unemployment rate', 'employment data', 'non-farm payrolls', 'nfp', 'jobless claims',
        'retail sales', 'industrial production', 'capacity utilization', 'housing starts',
        'building permits', 'existing home sales', 'new home sales', 'durable goods orders',
        'factory orders', 'ism manufacturing', 'ism services', 'pmi', 'purchasing managers index',
        'consumer confidence', 'business confidence', 'yield curve', 'treasury yield',
        'federal funds rate', 'discount rate', 'prime rate', 'libor', 'euribor', 'sofr',
        'monetary policy', 'fiscal policy', 'quantitative easing', 'qe', 'tapering',
        'forward guidance', 'central bank', 'federal reserve', 'ecb', 'boe', 'boj',
        'balance of payments', 'trade balance', 'current account', 'capital account',
        'foreign direct investment', 'fdi', 'portfolio investment', 'hot money',
        'currency reserve', 'foreign exchange reserve', 'commodity prices', 'oil price',
        'crude oil', 'brent crude', 'wti', 'gold price', 'silver price', 'copper price',
        'agricultural commodity', 'corn', 'wheat', 'soybean', 'coffee', 'sugar',
        'earnings report', 'earnings season', 'earnings surprise', 'beat estimates',
        'miss estimates', 'guidance', 'revenue guidance', 'earnings guidance',
        'balance sheet', 'income statement', 'cash flow statement', 'financial statement',
        'revenue', 'cost of goods sold', 'cogs', 'gross margin', 'operating margin',
        'net margin', 'return on equity', 'roe', 'return on assets', 'roa', 'return on capital', 'roc',
        'debt to equity', 'leverage ratio', 'current ratio', 'quick ratio', 'working capital',
        'free cash flow', 'fcf', 'earnings per share', 'eps', 'price to earnings', 'pe',
        'price to book', 'pb', 'price to sales', 'ps', 'enterprise value', 'ev', 'ev/ebitda'
    ],

    # Sports-Specific Keywords
    # (Grouped per sport for readability. 'offside', 'assist' and 'us open'
    # are listed under two sports each; the de-duplication step after the
    # literal keeps only the first occurrence.)
    'sports_domains': [
        # Football/Soccer
        'football', 'soccer', 'premier league', 'la liga', 'bundesliga', 'serie a', 'ligue 1',
        'champions league', 'europa league', 'fa cup', 'copa del rey', 'dfb pokal', 'coppa italia',
        'coupe de france', 'transfer', 'transfer window', 'squad rotation', 'starting eleven',
        'formation', '4-4-2', '4-3-3', '3-5-2', 'injury', 'suspension', 'red card', 'yellow card',
        'penalty', 'free kick', 'corner kick', 'throw in', 'offside', 'handball', 'foul',

        # Basketball
        'basketball', 'nba', 'college basketball', 'march madness', 'ncaa tournament',
        'point guard', 'shooting guard', 'small forward', 'power forward', 'center',
        'point spread', 'moneyline', 'over/under', 'first half', 'second half', 'quarter',
        'three pointer', 'free throw', 'rebound', 'assist', 'steal', 'block', 'turnover',
        'pace', 'efficiency', 'true shooting percentage', 'effective field goal percentage',

        # Baseball
        'baseball', 'mlb', 'american league', 'national league', 'world series', 'all star',
        'pitcher', 'catcher', 'infielder', 'outfielder', 'batting average', 'on base percentage',
        'slugging percentage', 'ops', 'era', 'whip', 'strikeout', 'home run', 'rbi',
        'run', 'hit', 'double', 'triple', 'stolen base', 'sacrifice fly', 'ground out',

        # Hockey
        'hockey', 'nhl', 'stanley cup', 'playoffs', 'regular season', 'power play',
        'penalty kill', 'faceoff', 'icing', 'offside', 'goal', 'assist', 'save percentage',
        'goals against average', 'plus/minus', 'corsi', 'fenwick', 'expected goals',

        # Tennis
        'tennis', 'atp', 'wta', 'grand slam', 'wimbledon', 'us open', 'french open', 'australian open',
        'ace', 'double fault', 'break point', 'match point', 'set point', 'tiebreak',
        'serve', 'return', 'forehand', 'backhand', 'volley', 'smash', 'drop shot',

        # Golf
        'golf', 'pga', 'lpga', 'masters', 'us open', 'british open', 'pga championship',
        'birdie', 'bogey', 'eagle', 'albatross', 'par', 'under par', 'over par', 'handicap',
        'fairway', 'rough', 'green', 'bunker', 'water hazard', 'putt', 'drive', 'iron',

        # MMA/UFC
        'mma', 'ufc', 'mixed martial arts', 'knockout', 'submission', 'decision', 'unanimous decision',
        'split decision', 'technical knockout', 'doctor stoppage', 'referee stoppage',
        'ground and pound', 'mount', 'guard', 'rear naked choke', 'armbar', 'triangle choke',
        'kimura', 'americana', 'heel hook', 'ankle lock', 'octagon', 'cage', 'fight card',

        # General Sports
        'home advantage', 'away disadvantage', 'motivation', 'momentum', 'form', 'streak',
        'winning streak', 'losing streak', 'hot streak', 'cold streak', 'rest', 'travel',
        'jet lag', 'weather', 'field conditions', 'referee', 'umpire', 'official', 'var',
        'video assistant referee', 'instant replay', 'coaching', 'tactics', 'strategy'
    ]
}

# Remove repeated keywords inside each domain list. dict.fromkeys keeps the
# first occurrence and preserves the original order, so every keyword is still
# present exactly once and domain order is unchanged.
TRADING_KEYWORD_TAXONOMY = {
    domain: list(dict.fromkeys(keywords))
    for domain, keywords in TRADING_KEYWORD_TAXONOMY.items()
}

# Backward-compatibility view of the taxonomy: one flat list holding every
# domain's keywords, concatenated in the taxonomy's domain order.
TECHNICAL_TERMS = [
    keyword
    for keyword_group in TRADING_KEYWORD_TAXONOMY.values()
    for keyword in keyword_group
]

# Synonym mappings for keyword expansion.
# Each key is a term; its value lists related phrasings for that term.
TRADING_SYNONYMS = {
    # --- Trading ---
    "trading": ["trade", "trader", "trading strategy", "trading system"],
    "buy": ["purchase", "long", "go long", "bullish"],
    "sell": ["short", "go short", "bearish"],
    "profit": ["gain", "earnings", "returns", "pnl"],
    "loss": ["drawdown", "negative return", "deficit"],

    # --- Sports betting ---
    "odds": ["line", "spread", "vig", "juice"],
    "bet": ["wager", "stake", "gamble"],
    "bookmaker": ["bookie", "sportsbook", "book"],
    "parlay": ["accumulator", "combo", "multi"],
    "hedge": ["insurance", "protection", "cover"],

    # --- Financial markets ---
    "stock": ["equity", "share", "security"],
    "dividend": ["yield", "payout", "distribution"],
    "earnings": ["profit", "revenue", "income"],
    "ipo": ["initial public offering", "public offering"],

    # --- Crypto ---
    "bitcoin": ["btc", "digital gold", "cryptocurrency"],
    "ethereum": ["eth", "smart contract platform"],
    "mining": ["proof of work", "pow", "hashing"],
    "staking": ["proof of stake", "pos", "yield farming"],

    # --- Forex ---
    "forex": ["fx", "foreign exchange", "currency trading"],
    "pip": ["percentage in point", "price interest point"],
    "leverage": ["margin trading", "gearing"],
    "central bank": ["fed", "ecb", "boe", "boj", "rbnz"],

    # --- Technical analysis ---
    "support": ["floor", "bottom", "demand zone"],
    "resistance": ["ceiling", "top", "supply zone"],
    "trend": ["direction", "momentum", "bias"],
    "breakout": ["break above", "break below", "break through"],
    "reversal": ["turnaround", "pivot", "change in direction"],

    # --- Risk management ---
    "stop loss": ["stop order", "protective stop", "risk stop"],
    "take profit": ["profit target", "exit target", "limit order"],
    "diversification": ["spreading risk", "asset allocation", "portfolio mix"],
}

# Keyword frequency weights: keyword -> weight, where a larger number marks a
# more important keyword. Keywords that are not listed have no entry here.
KEYWORD_WEIGHTS = {
    # Tier: high priority
    "trading strategy": 1.0,
    "risk management": 1.0,
    "technical analysis": 0.9,
    "fundamental analysis": 0.9,
    "backtesting": 0.9,
    "performance": 0.8,
    "volatility": 0.8,
    "liquidity": 0.8,

    # Tier: domain-specific high priority
    "parlay": 0.9,
    "round robin": 0.9,
    "kelly criterion": 0.8,
    "arbitrage": 0.8,
    "hedge": 0.8,

    # Tier: medium priority
    "profit": 0.7,
    "loss": 0.7,
    "trend": 0.7,
    "momentum": 0.7,
    "breakout": 0.7,

    # Tier: lower priority (but still relevant)
    "buy": 0.5,
    "sell": 0.5,
    "long": 0.5,
    "short": 0.5,
}

# Domain-specific keyword mappings for filtering: domain name -> keywords
# associated with that domain.
DOMAIN_KEYWORD_MAPPING = {
    "sports": [
        "sports betting", "football", "basketball", "baseball", "hockey",
        "tennis", "golf", "mma", "odds", "spread", "moneyline", "parlay",
        "round robin",
    ],
    "crypto": [
        "cryptocurrency", "bitcoin", "ethereum", "blockchain", "mining",
        "staking", "defi", "nft", "wallet", "exchange",
    ],
    "stocks": [
        "stock", "equity", "dividend", "earnings", "ipo", "market cap",
        "pe ratio", "technical analysis", "fundamental analysis",
    ],
    "forex": [
        "forex", "currency", "pip", "leverage", "carry trade", "central bank",
        "economic indicator", "eurusd", "gbpusd", "usdjpy",
    ],
}

# Comprehensive keyword list for improved search relevance
# These terms get boosted in search scoring for better matching
# LEGACY TERMS (keeping for backward compatibility)
#
# Appends the pre-trading vocabulary (cultivation, cannabis, medical, legal,
# business, science, psychology, general) to TECHNICAL_TERMS, then removes
# repeated entries from the combined list.
# NOTE(review): the list contains very short tokens ('g', '%', 'ms', 'ec', ...);
# if the search code matches by substring these will hit very broadly —
# confirm against the consumer.
TECHNICAL_TERMS.extend([
    # Growing/Cultivation (legacy from original cannabis focus)
    'ph', 'ph level', 'ph.', 'ph-', 'ph:', 'ph scale', 'acidity', 'alkalinity', 'acidic', 'alkaline',
    'temperature', 'temp', 'celsius', 'fahrenheit', '°c', '°f', 'degrees', 'warm', 'cool', 'heat',
    'humidity', 'rh', 'relative humidity', 'moisture', 'dry', 'wet',
    'ppm', 'parts per million', 'ec', 'electrical conductivity', 'tds', 'total dissolved solids',
    'nutrient', 'fertilizer', 'npk', 'nitrogen', 'phosphorus', 'potassium', 'macronutrient', 'micronutrient',
    'measurement', 'range', 'level', 'amount', 'concentration', 'quantity',
    'light', 'lumens', 'par', 'ppfd', 'watts', 'led', 'hps', 'mh', 'spectrum', 'photoperiod',
    'soil', 'medium', 'substrate', 'coco', 'perlite', 'vermiculite', 'peat', 'compost',
    'watering', 'irrigation', 'drainage', 'runoff', 'flood', 'drip', 'spray',
    'hydroponic', 'hydroponics', 'hydro', 'soilless', 'deep water culture', 'dwc', 'nft', 'aeroponic', 'ebb and flow',
    'growing', 'cultivation', 'cultivate', 'grow', 'plant', 'crop', 'harvest',
    'seed', 'seedling', 'clone', 'cutting', 'propagation', 'germination',
    'vegetative', 'veg', 'flowering', 'flower', 'bud', 'trichome', 'pistil', 'stamen',
    'curing', 'drying', 'trimming', 'pruning', 'topping', 'fimming', 'lst', 'hst',
    'pest', 'disease', 'mold', 'mildew', 'spider mite', 'aphid', 'fungus', 'bacteria',
    'ventilation', 'airflow', 'fan', 'exhaust', 'intake', 'carbon filter',
    'indoor', 'outdoor', 'greenhouse', 'grow room', 'grow tent',

    # Cannabis-Specific
    'cannabis', 'marijuana', 'hemp', 'weed', 'pot', 'ganja', 'bud', 'flower',
    'strain', 'cultivar', 'variety', 'genetics', 'phenotype', 'genotype',
    'indica', 'sativa', 'hybrid', 'ruderalis', 'autoflower', 'auto',
    'thc', 'cbd', 'cbg', 'cbn', 'cbc', 'thca', 'cbda', 'cannabinoid', 'cannabinoids',
    'terpene', 'terpenes', 'terp', 'myrcene', 'limonene', 'pinene', 'linalool', 'caryophyllene',
    'cannabinoid profile', 'terpene profile', 'thc content', 'cbd content', 'potency',
    'extraction', 'extract', 'concentrate', 'distillate', 'isolate', 'full spectrum', 'broad spectrum',
    'rosin', 'hash', 'kief', 'bubble hash', 'dry sift', 'live resin', 'shatter', 'wax', 'budder',
    'co2 extraction', 'solvent extraction', 'butane', 'propane', 'ethanol', 'hydrocarbon',
    'edible', 'tincture', 'topical', 'capsule', 'vape', 'cartridge', 'dab', 'dabbing',
    'smoking', 'vaping', 'ingestion', 'sublingual', 'transdermal',

    # Health/Medical
    'health', 'healthy', 'wellness', 'wellbeing', 'therapeutic', 'therapeutic dose', 'medical', 'medicinal', 'medicine',
    'treatment', 'therapy', 'therapeutic use', 'medical use', 'medicinal use',
    'dosage', 'dose', 'dosing', 'mg', 'milligram', 'microgram', 'mcg', 'ml', 'milliliter', 'gram', 'g',
    'mg/kg', 'bioavailability', 'half-life', 'metabolism', 'metabolite', 'pharmacokinetics',
    'pain', 'pain relief', 'analgesic', 'chronic pain', 'neuropathic pain',
    'anxiety', 'anxiety relief', 'stress', 'stress relief', 'relaxation', 'calm',
    'inflammation', 'anti-inflammatory', 'inflammatory', 'swelling',
    'seizure', 'epilepsy', 'epileptic', 'anticonvulsant', 'dravet', 'lennox-gastaut',
    'nausea', 'vomiting', 'appetite', 'appetite stimulant', 'cachexia',
    'sleep', 'insomnia', 'sleep aid', 'sedative', 'sleep quality',
    'depression', 'mood', 'mood disorder', 'bipolar', 'ptsd', 'trauma',
    'cancer', 'tumor', 'chemotherapy', 'cancer treatment', 'oncology',
    'glaucoma', 'eye pressure', 'intraocular pressure',
    'multiple sclerosis', 'ms', 'spasticity', 'muscle spasm',
    'parkinson', 'tremor', 'movement disorder',
    'alzheimer', 'dementia', 'cognitive', 'memory',
    'autism', 'autism spectrum', 'asd',
    'adhd', 'attention deficit', 'hyperactivity',
    'addiction', 'substance abuse', 'withdrawal', 'dependence',
    'side effect', 'adverse effect', 'contraindication', 'interaction', 'drug interaction',
    'safety', 'safe', 'toxicity', 'toxic', 'overdose', 'lethal dose',
    'clinical trial', 'study', 'research', 'peer-reviewed', 'evidence', 'efficacy', 'effective',
    'patient', 'symptom', 'condition', 'disease', 'disorder', 'syndrome',
    'doctor', 'physician', 'medical professional', 'prescription', 'recommendation',

    # Law/Legal
    'law', 'legal', 'illegal', 'legality', 'legalization', 'legalized', 'decriminalization', 'decriminalized',
    'regulation', 'regulated', 'regulatory', 'compliance', 'compliant',
    'license', 'licensing', 'permit', 'permission', 'authorization',
    'jurisdiction', 'state law', 'federal law', 'local law', 'municipal', 'city ordinance',
    'state', 'federal', 'federally', 'government', 'govt',
    'dea', 'drug enforcement administration', 'fda', 'food and drug administration', 'atf', 'bureau of alcohol tobacco firearms',
    'possession', 'possession limit', 'possession limit', 'carry', 'transport',
    'grams', 'ounces', 'pounds', 'oz', 'lb', 'kg', 'kilogram',
    'criminal', 'crime', 'criminalize', 'misdemeanor', 'felony', 'infraction',
    'penalty', 'fine', 'sentence', 'jail', 'prison', 'incarceration',
    'legislation', 'bill', 'act', 'statute', 'ordinance', 'law', 'rule',
    'prohibition', 'prohibited', 'ban', 'banned', 'restriction', 'restricted',
    'tax', 'taxation', 'taxed', 'excise tax', 'sales tax',
    'dispensary', 'retail', 'retailer', 'store', 'shop',
    'cultivation', 'cultivation license', 'grow license', 'growing permit',
    'distribution', 'distributor', 'wholesale', 'wholesaler',
    'testing', 'lab testing', 'laboratory', 'quality control', 'qc',
    'packaging', 'labeling', 'label', 'warning', 'child-resistant',

    # Business/Economics
    'business', 'business owner', 'entrepreneur', 'entrepreneurship', 'startup', 'company', 'corporation',
    'revenue', 'profit', 'margin', 'profit margin', 'earnings', 'income',
    'cost', 'price', 'pricing', 'wholesale price', 'retail price', 'market price',
    'market', 'market size', 'market share', 'market analysis', 'market research',
    'valuation', 'value', 'worth', 'investment', 'invest', 'investor', 'roi', 'return on investment',
    'sales', 'revenue', 'cagr', 'growth rate', 'growth', 'expanding', 'expansion',
    'supply', 'demand', 'supply and demand', 'price elasticity', 'supply chain', 'distribution',
    'business model', 'revenue model', 'pricing strategy', 'market entry', 'entry strategy',
    'competition', 'competitive', 'competitor', 'market segment', 'target market', 'niche',
    'financial', 'finance', 'funding', 'capital', 'equity', 'debt', 'loan', 'credit',
    'partnership', 'partner', 'joint venture', 'merger', 'acquisition',
    'franchise', 'franchising', 'franchisee', 'franchisor',
    'marketing', 'advertising', 'promotion', 'brand', 'branding',
    'customer', 'client', 'consumer', 'user', 'patient',

    # Science/Research
    'science', 'scientific', 'research', 'researcher', 'researcher', 'study', 'studies',
    'clinical trial', 'trial', 'peer-reviewed', 'peer review', 'journal', 'publication',
    'methodology', 'method', 'procedure', 'protocol', 'experiment', 'experimental',
    'data', 'dataset', 'results', 'findings', 'conclusion', 'hypothesis', 'theory',
    'compound', 'molecule', 'chemical', 'chemistry', 'biochemistry',
    'extraction', 'distillation', 'chromatography', 'purification', 'refinement',
    'concentration', 'percentage', '%', 'ratio', 'proportion', 'percentage',
    'analysis', 'analyze', 'analytical', 'measurement', 'quantify', 'qualify', 'quantitative', 'qualitative',
    'statistical', 'statistics', 'statistically', 'significant', 'significance', 'p-value', 'correlation',
    'laboratory', 'lab', 'testing', 'test', 'assay', 'sample', 'specimen',
    'evidence', 'evidence-based', 'empirical', 'empirically', 'proven', 'proof',
    'biology', 'biological', 'botany', 'botanical', 'plant science', 'horticulture', 'horticultural',
    'genetics', 'genetic', 'genome', 'dna', 'rna', 'gene', 'genetic modification', 'gm',
    'pharmacology', 'pharmacological', 'pharmacokinetic', 'pharmacodynamic',
    'neuroscience', 'neurological', 'brain', 'neural', 'neuron', 'neurotransmitter',
    'endocannabinoid', 'endocannabinoid system', 'ecs', 'receptor', 'cb1', 'cb2',

    # Psychology/Mental Health
    'psychology', 'psychological', 'psychologist', 'mental health', 'mental illness',
    'depression', 'depressive', 'depressed', 'mood', 'mood disorder', 'bipolar', 'manic',
    'anxiety', 'anxious', 'panic', 'panic attack', 'phobia', 'social anxiety',
    'ptsd', 'post-traumatic stress', 'trauma', 'traumatic', 'stress', 'stressed',
    'adhd', 'attention deficit', 'hyperactivity', 'hyperactive', 'focus', 'concentration',
    'autism', 'autistic', 'autism spectrum', 'asd', 'asperger',
    'addiction', 'addictive', 'substance abuse', 'dependence', 'dependent', 'withdrawal',
    'cognitive', 'cognition', 'memory', 'recall', 'learning', 'brain function',
    'behavior', 'behavioral', 'behavioral therapy', 'cbt', 'cognitive behavioral therapy',
    'therapy', 'therapist', 'counseling', 'counselor', 'psychotherapy',
    'wellbeing', 'wellness', 'mental wellness', 'emotional', 'emotion',
    'mindfulness', 'meditation', 'relaxation', 'calm', 'peaceful',

    # General Important Terms
    'optimal', 'optimum', 'ideal', 'best', 'recommended', 'suggested',
    'effective', 'efficacy', 'efficient', 'efficiency',
    'safe', 'safety', 'risk', 'risky', 'dangerous', 'harmful',
    'benefit', 'beneficial', 'advantage', 'advantageous',
    'quality', 'high quality', 'premium', 'grade', 'grade a',
    'organic', 'organically', 'natural', 'naturally', 'synthetic', 'artificial'
])

# The combined list repeats many terms: some are written twice in the literal
# above (e.g. 'possession limit', 'researcher', 'percentage', 'law'), others
# are shared between sections or with the taxonomy-derived terms (e.g.
# 'profit', 'revenue', 'risk'). Keep only the first occurrence of each term.
# Slice assignment mutates the existing list object, so references already
# held elsewhere see the de-duplicated contents; relative order is preserved.
TECHNICAL_TERMS[:] = list(dict.fromkeys(TECHNICAL_TERMS))

# Synonym map for query expansion: term -> alternative terms.
# Entries are one-directional; a reverse mapping exists only where it is
# listed explicitly (e.g. 'cannabis' <-> 'marijuana').
SYNONYMS = {
    # Plant names
    "cannabis": ["marijuana", "weed", "hemp", "pot", "ganja"],
    "marijuana": ["cannabis", "weed", "hemp", "pot", "ganja"],
    "weed": ["cannabis", "marijuana", "hemp", "pot"],
    # Growing
    "growing": ["cultivation", "farming", "planting"],
    "cultivation": ["growing", "farming", "planting"],
    "hydroponics": ["hydro", "soilless", "dwc", "nft"],
    "hydro": ["hydroponics"],
    "lighting": ["lights", "lamps", "led", "hps"],
    "nutrients": ["fertilizer", "food", "feeding"],
    "fertilizer": ["nutrients", "food"],
    "soil": ["medium", "dirt", "substrate", "earth"],
    # Medical / legal / business
    "medical": ["medicinal", "therapeutic", "health"],
    "legal": ["law", "legislation", "regulation"],
    "law": ["legal", "legislation", "statute"],
    "business": ["industry", "market", "commercial"],
    # Product
    "yield": ["harvest", "production", "output"],
    "extract": ["concentrate", "oil", "wax", "shatter"],
    "thc": ["potency", "psychoactive"],
    "cbd": ["cannabidiol", "non-psychoactive"],
}

# Measurement patterns
# Regular expressions (written in lowercase) that pick numeric measurements out
# of text. Every pattern captures the number(s) as groups; list order and the
# number of capture groups per pattern are part of the contract with callers.
# NOTE(review): patterns use lowercase units ('ph', 'mg', '°[cf]'), so callers
# presumably lowercase the text or pass re.IGNORECASE — confirm.
#
# Range separators are written as (?:[-–—]|to): a hyphen/en dash/em dash or the
# word "to". A character class such as [-–—to] would instead match a single
# 't' or 'o' and never the word itself.
MEASUREMENT_PATTERNS = [
    # Growing/Cultivation
    r'ph\s*[:\-]?\s*(\d+\.?\d*)\s*(?:[-–—]|to)\s*(\d+\.?\d*)',  # pH 6.5-8.0, pH 6.5 to 8.0
    r'(\d+\.?\d*)\s*(?:[-–—]|to)\s*(\d+\.?\d*)\s*ph',  # 6.5-8.0 pH, 6.5 to 8.0 pH
    r'(\d+\.?\d*)\s*ppm',  # 500 ppm
    r'(\d+\.?\d*)\s*°[cf]',  # 75°F
    r'(\d+\.?\d*)\s*percent',  # 50 percent
    r'(\d+\.?\d*)\s*%',  # 50%

    # Health/Medical
    r'(\d+\.?\d*)\s*mg',  # 10 mg
    r'(\d+\.?\d*)\s*milligram',  # 10 milligram
    r'(\d+\.?\d*)\s*mcg',  # 5 mcg
    r'(\d+\.?\d*)\s*ml',  # 2.5 ml
    r'(\d+\.?\d*)\s*mg/kg',  # 1 mg/kg
    # The unit after the first number is optional so both "5-10 mg" and
    # "5 mg - 10 mg" are recognised.
    r'(\d+\.?\d*)\s*(?:mg)?\s*(?:[-–—]|to)\s*(\d+\.?\d*)\s*mg',  # 5-10 mg, 5 mg to 10 mg

    # Law/Legal (singular and plural unit names)
    r'(\d+\.?\d*)\s*grams?',  # 28 grams, 1 gram
    r'(\d+\.?\d*)\s*ounces?',  # 1 ounce, 2 ounces
    r'(\d+\.?\d*)\s*pounds?',  # 1 pound, 5 pounds

    # Business/Economics
    r'\$(\d+[,\d]*\.?\d*)\s*(million|billion|thousand)',  # $1.5 billion
    r'(\d+\.?\d*)\s*%?\s*(roi|margin|growth)',  # 15% ROI
    r'(\d+\.?\d*)\s*(?:percent|%)\s*(market|share)',  # 25% market share, 25 percent market share
]

# Language model settings (a smaller, accessible model intended for CPU use).
# Model id: the distilled DeepSeek reasoning model.
LANGUAGE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Generation parameters.
# NOTE(review): sampling is disabled, so TEMPERATURE is presumably ignored by
# the generation call — confirm against the code that consumes these values.
MAX_NEW_TOKENS = 256
DO_SAMPLE = False
TEMPERATURE = 0.2

# Knowledge card preprocessing settings
# Upper bound on how many chunks get refined with the LLM. The value is set
# deliberately high so that, in practice, every chunk is processed.
KNOWLEDGE_CARD_LLM_BUDGET = 999_999

# Hybrid retrieval settings
# NOTE(review): the vector, keyword and BM25 weights below add up to 1.25, not
# 1.0 — confirm whether the scoring code normalises them.
HYBRID_SEARCH_OVERSAMPLE = 4

# Score weights: vector and keyword matching are weighted equally so keyword
# hits count as much as embedding similarity.
HYBRID_VECTOR_WEIGHT = 0.5
HYBRID_KEYWORD_WEIGHT = 0.5
HYBRID_BM25_WEIGHT = 0.25  # Additional weight for BM25 field scoring

# Thresholds and result limits.
HYBRID_MIN_KEYWORD_SCORE = 0.05  # Kept low to allow more results through
MIN_CHUNK_SCORE = 0.5  # Minimum combined score threshold (filter low-quality chunks)
MAX_CHUNKS_PER_SOURCE = 2  # Limit per source to increase variety in results
DEFAULT_RETRIEVAL_K = 10  # Default number of chunks to retrieve

# Enrichment settings
# Thread counts for enrichment: the default, and the highest value allowed.
ENRICHMENT_DEFAULT_THREADS, ENRICHMENT_MAX_THREADS = 2, 6

# Enrichment logging settings
# Both files sit in the "metadata" folder under PROCESSED_DIR (the same
# location METADATA_DIR points to): a JSON status file and a JSON-lines log.
ENRICHMENT_STATUS_PATH = Path(PROCESSED_DIR, "metadata", "enrichment_status.json")
ENRICHMENT_LOG_PATH = Path(PROCESSED_DIR, "metadata", "enrichment_log.jsonl")

# Web interface settings
# NOTE(review): "0.0.0.0" is the all-interfaces address, so the web UI is
# presumably reachable from other machines on the network — confirm this is
# intended for the deployment.
WEB_HOST, WEB_PORT = "0.0.0.0", 5000
