import os
from pathlib import Path
from dataclasses import dataclass
from typing import Literal, Optional
import torch

@dataclass
class ProcessingConfig:
    """Runtime configuration for device, embedding backend, FAISS and I/O.

    ``device``, ``precision`` and ``backend`` accept the placeholder
    ``"auto"``; ``__post_init__`` replaces each placeholder with a concrete
    value. Values supplied explicitly by the caller are never overridden.
    """

    # Environment detection
    device: str = "auto"  # "auto", "cpu", "cuda" or an indexed form such as "cuda:0"
    use_gpu: bool = True  # only consulted while resolving device == "auto"
    use_quantization: bool = True

    # Embedding optimization
    backend: Literal["auto", "onnx", "openvino", "pytorch"] = "auto"
    precision: Literal["auto", "fp32", "fp16", "int8"] = "auto"
    batch_size: int = 64
    max_workers: int = 4

    # FAISS optimization
    faiss_index_type: str = "IVF_FLAT"
    faiss_quantization: str = "PQ"  # or "SQ" for CPU
    n_probe: int = 10
    n_list: int = 100

    # I/O optimization
    use_async_io: bool = True
    use_parquet: bool = True
    compression: bool = True

    def __post_init__(self) -> None:
        """Resolve every field still set to ``"auto"``."""
        if self.device == "auto":
            # Test use_gpu first so CUDA is not probed when the GPU is disabled.
            self.device = "cuda" if self.use_gpu and torch.cuda.is_available() else "cpu"

        # Treat indexed devices ("cuda:0", "cuda:1", ...) the same as plain
        # "cuda"; str() also tolerates a torch.device being passed in.
        on_cuda = str(self.device).startswith("cuda")

        if self.precision == "auto":
            self.precision = "fp16" if on_cuda else "int8"

        if self.backend == "auto":
            # CPU default is "onnx"; "openvino" is the alternative, to be
            # chosen based on benchmarks.
            self.backend = "pytorch" if on_cuda else "onnx"

# Paths - every data directory is anchored at the folder containing this file
BASE_DIR = Path(__file__).parent
PDF_DIR = BASE_DIR.joinpath("pdf_directory")
PROCESSED_DIR = BASE_DIR.joinpath("processed")
EMBEDDINGS_DIR = PROCESSED_DIR.joinpath("embeddings")
DOCUMENTS_DIR = PROCESSED_DIR.joinpath("documents")
METADATA_DIR = PROCESSED_DIR.joinpath("metadata")
NOTES_DIR = PROCESSED_DIR.joinpath("notes")
DIGESTS_DIR = PROCESSED_DIR.joinpath("digests")
CAPSULES_DIR = PROCESSED_DIR.joinpath("capsules")
MEMORY_DIR = PROCESSED_DIR.joinpath("memory")

# Create directories at import time. PROCESSED_DIR is not listed explicitly;
# it comes into existence through parents=True on its children.
for dir_path in (
    PDF_DIR,
    EMBEDDINGS_DIR,
    DOCUMENTS_DIR,
    METADATA_DIR,
    NOTES_DIR,
    DIGESTS_DIR,
    CAPSULES_DIR,
    MEMORY_DIR,
):
    dir_path.mkdir(parents=True, exist_ok=True)

# Processing settings - OPTIMIZED FOR PRODUCTION
CHUNK_SIZE = 1000  # Target chunk size in characters (default)
CHUNK_OVERLAP = 200  # Overlap in characters (will be converted to sentences)
CHUNK_MIN_SIZE = 400  # Minimum chunk size (reject smaller)
CHUNK_MAX_SIZE = 1600  # Maximum chunk size
SENTENCE_OVERLAP = 2  # Number of sentences to overlap between chunks
MIN_SENTENCES_PER_CHUNK = 3  # Minimum sentences required per chunk

# Document type-specific chunking parameters.
# Every profile carries the same five keys; sizes are in characters and
# 'sentence_overlap' / 'min_sentences' are sentence counts.
DOCUMENT_TYPE_CONFIGS = {
    'technical': {
        'chunk_size': 800,  # Smaller chunks for dense technical content
        'chunk_min_size': 400,
        'chunk_max_size': 1200,
        'sentence_overlap': 1,  # Less overlap for technical specs
        'min_sentences': 2,  # Technical content can be shorter
    },
    'research': {
        'chunk_size': 1000,  # Standard size for research papers
        'chunk_min_size': 500,
        'chunk_max_size': 1500,
        'sentence_overlap': 2,  # More overlap for narrative flow
        'min_sentences': 3,
    },
    'legal': {
        'chunk_size': 900,  # Slightly smaller for precise legal text
        'chunk_min_size': 500,
        'chunk_max_size': 1400,
        'sentence_overlap': 2,  # Preserve context in legal documents
        'min_sentences': 3,
    },
    'manual': {
        'chunk_size': 1200,  # Larger chunks for procedural content
        'chunk_min_size': 400,
        'chunk_max_size': 1800,
        'sentence_overlap': 2,  # Maintain procedural continuity
        'min_sentences': 3,
    },
    # Built from the module-level defaults above so the two definitions of
    # "default chunking" cannot drift apart.
    'default': {
        'chunk_size': CHUNK_SIZE,
        'chunk_min_size': CHUNK_MIN_SIZE,
        'chunk_max_size': CHUNK_MAX_SIZE,
        'sentence_overlap': SENTENCE_OVERLAP,
        'min_sentences': MIN_SENTENCES_PER_CHUNK,
    },
}

# Embedding model; 384 dimensions, chosen to match the existing FAISS index
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"

# DEPRECATED: semantic chunking has been replaced by sentence-based chunking.
# The flag stays False; the two tuning values below are kept alongside it.
SEMANTIC_CHUNKING_ENABLED = False
SEMANTIC_MAX_CHARS = 1800
SEMANTIC_SIMILARITY_THRESHOLD = 0.58

# Comprehensive keyword list for improved search relevance
# These terms get boosted in search scoring for better matching
#
# The category groups below overlap (e.g. 'bud', 'research', 'anxiety' appear
# under more than one heading, and a few terms were typed twice on one line).
# dict.fromkeys() drops the repeats while keeping first-occurrence order, so
# the resulting list holds each term exactly once.
TECHNICAL_TERMS = list(dict.fromkeys([
    # Growing/Cultivation
    'ph', 'ph level', 'ph.', 'ph-', 'ph:', 'ph scale', 'acidity', 'alkalinity', 'acidic', 'alkaline',
    'temperature', 'temp', 'celsius', 'fahrenheit', '°c', '°f', 'degrees', 'warm', 'cool', 'heat',
    'humidity', 'rh', 'relative humidity', 'moisture', 'dry', 'wet',
    'ppm', 'parts per million', 'ec', 'electrical conductivity', 'tds', 'total dissolved solids',
    'nutrient', 'fertilizer', 'npk', 'nitrogen', 'phosphorus', 'potassium', 'macronutrient', 'micronutrient',
    'measurement', 'range', 'level', 'amount', 'concentration', 'quantity',
    'light', 'lumens', 'par', 'ppfd', 'watts', 'led', 'hps', 'mh', 'spectrum', 'photoperiod',
    'soil', 'medium', 'substrate', 'coco', 'perlite', 'vermiculite', 'peat', 'compost',
    'watering', 'irrigation', 'drainage', 'runoff', 'flood', 'drip', 'spray',
    'hydroponic', 'hydroponics', 'hydro', 'soilless', 'deep water culture', 'dwc', 'nft', 'aeroponic', 'ebb and flow',
    'growing', 'cultivation', 'cultivate', 'grow', 'plant', 'crop', 'harvest',
    'seed', 'seedling', 'clone', 'cutting', 'propagation', 'germination',
    'vegetative', 'veg', 'flowering', 'flower', 'bud', 'trichome', 'pistil', 'stamen',
    'curing', 'drying', 'trimming', 'pruning', 'topping', 'fimming', 'lst', 'hst',
    'pest', 'disease', 'mold', 'mildew', 'spider mite', 'aphid', 'fungus', 'bacteria',
    'ventilation', 'airflow', 'fan', 'exhaust', 'intake', 'carbon filter',
    'indoor', 'outdoor', 'greenhouse', 'grow room', 'grow tent',

    # Cannabis-Specific
    'cannabis', 'marijuana', 'hemp', 'weed', 'pot', 'ganja', 'bud', 'flower',
    'strain', 'cultivar', 'variety', 'genetics', 'phenotype', 'genotype',
    'indica', 'sativa', 'hybrid', 'ruderalis', 'autoflower', 'auto',
    'thc', 'cbd', 'cbg', 'cbn', 'cbc', 'thca', 'cbda', 'cannabinoid', 'cannabinoids',
    'terpene', 'terpenes', 'terp', 'myrcene', 'limonene', 'pinene', 'linalool', 'caryophyllene',
    'cannabinoid profile', 'terpene profile', 'thc content', 'cbd content', 'potency',
    'extraction', 'extract', 'concentrate', 'distillate', 'isolate', 'full spectrum', 'broad spectrum',
    'rosin', 'hash', 'kief', 'bubble hash', 'dry sift', 'live resin', 'shatter', 'wax', 'budder',
    'co2 extraction', 'solvent extraction', 'butane', 'propane', 'ethanol', 'hydrocarbon',
    'edible', 'tincture', 'topical', 'capsule', 'vape', 'cartridge', 'dab', 'dabbing',
    'smoking', 'vaping', 'ingestion', 'sublingual', 'transdermal',

    # Health/Medical
    'health', 'healthy', 'wellness', 'wellbeing', 'therapeutic', 'therapeutic dose', 'medical', 'medicinal', 'medicine',
    'treatment', 'therapy', 'therapeutic use', 'medical use', 'medicinal use',
    'dosage', 'dose', 'dosing', 'mg', 'milligram', 'microgram', 'mcg', 'ml', 'milliliter', 'gram', 'g',
    'mg/kg', 'bioavailability', 'half-life', 'metabolism', 'metabolite', 'pharmacokinetics',
    'pain', 'pain relief', 'analgesic', 'chronic pain', 'neuropathic pain',
    'anxiety', 'anxiety relief', 'stress', 'stress relief', 'relaxation', 'calm',
    'inflammation', 'anti-inflammatory', 'inflammatory', 'swelling',
    'seizure', 'epilepsy', 'epileptic', 'anticonvulsant', 'dravet', 'lennox-gastaut',
    'nausea', 'vomiting', 'appetite', 'appetite stimulant', 'cachexia',
    'sleep', 'insomnia', 'sleep aid', 'sedative', 'sleep quality',
    'depression', 'mood', 'mood disorder', 'bipolar', 'ptsd', 'trauma',
    'cancer', 'tumor', 'chemotherapy', 'cancer treatment', 'oncology',
    'glaucoma', 'eye pressure', 'intraocular pressure',
    'multiple sclerosis', 'ms', 'spasticity', 'muscle spasm',
    'parkinson', 'tremor', 'movement disorder',
    'alzheimer', 'dementia', 'cognitive', 'memory',
    'autism', 'autism spectrum', 'asd',
    'adhd', 'attention deficit', 'hyperactivity',
    'addiction', 'substance abuse', 'withdrawal', 'dependence',
    'side effect', 'adverse effect', 'contraindication', 'interaction', 'drug interaction',
    'safety', 'safe', 'toxicity', 'toxic', 'overdose', 'lethal dose',
    'clinical trial', 'study', 'research', 'peer-reviewed', 'evidence', 'efficacy', 'effective',
    'patient', 'symptom', 'condition', 'disease', 'disorder', 'syndrome',
    'doctor', 'physician', 'medical professional', 'prescription', 'recommendation',

    # Law/Legal
    'law', 'legal', 'illegal', 'legality', 'legalization', 'legalized', 'decriminalization', 'decriminalized',
    'regulation', 'regulated', 'regulatory', 'compliance', 'compliant',
    'license', 'licensing', 'permit', 'permission', 'authorization',
    'jurisdiction', 'state law', 'federal law', 'local law', 'municipal', 'city ordinance',
    'state', 'federal', 'federally', 'government', 'govt',
    'dea', 'drug enforcement administration', 'fda', 'food and drug administration', 'atf', 'bureau of alcohol tobacco firearms',
    'possession', 'possession limit', 'possession limit', 'carry', 'transport',
    'grams', 'ounces', 'pounds', 'oz', 'lb', 'kg', 'kilogram',
    'criminal', 'crime', 'criminalize', 'misdemeanor', 'felony', 'infraction',
    'penalty', 'fine', 'sentence', 'jail', 'prison', 'incarceration',
    'legislation', 'bill', 'act', 'statute', 'ordinance', 'law', 'rule',
    'prohibition', 'prohibited', 'ban', 'banned', 'restriction', 'restricted',
    'tax', 'taxation', 'taxed', 'excise tax', 'sales tax',
    'dispensary', 'retail', 'retailer', 'store', 'shop',
    'cultivation', 'cultivation license', 'grow license', 'growing permit',
    'distribution', 'distributor', 'wholesale', 'wholesaler',
    'testing', 'lab testing', 'laboratory', 'quality control', 'qc',
    'packaging', 'labeling', 'label', 'warning', 'child-resistant',

    # Business/Economics
    'business', 'business owner', 'entrepreneur', 'entrepreneurship', 'startup', 'company', 'corporation',
    'revenue', 'profit', 'margin', 'profit margin', 'earnings', 'income',
    'cost', 'price', 'pricing', 'wholesale price', 'retail price', 'market price',
    'market', 'market size', 'market share', 'market analysis', 'market research',
    'valuation', 'value', 'worth', 'investment', 'invest', 'investor', 'roi', 'return on investment',
    'sales', 'revenue', 'cagr', 'growth rate', 'growth', 'expanding', 'expansion',
    'supply', 'demand', 'supply and demand', 'price elasticity', 'supply chain', 'distribution',
    'business model', 'revenue model', 'pricing strategy', 'market entry', 'entry strategy',
    'competition', 'competitive', 'competitor', 'market segment', 'target market', 'niche',
    'financial', 'finance', 'funding', 'capital', 'equity', 'debt', 'loan', 'credit',
    'partnership', 'partner', 'joint venture', 'merger', 'acquisition',
    'franchise', 'franchising', 'franchisee', 'franchisor',
    'marketing', 'advertising', 'promotion', 'brand', 'branding',
    'customer', 'client', 'consumer', 'user', 'patient',

    # Science/Research
    'science', 'scientific', 'research', 'researcher', 'researcher', 'study', 'studies',
    'clinical trial', 'trial', 'peer-reviewed', 'peer review', 'journal', 'publication',
    'methodology', 'method', 'procedure', 'protocol', 'experiment', 'experimental',
    'data', 'dataset', 'results', 'findings', 'conclusion', 'hypothesis', 'theory',
    'compound', 'molecule', 'chemical', 'chemistry', 'biochemistry',
    'extraction', 'distillation', 'chromatography', 'purification', 'refinement',
    'concentration', 'percentage', '%', 'ratio', 'proportion', 'percentage',
    'analysis', 'analyze', 'analytical', 'measurement', 'quantify', 'qualify', 'quantitative', 'qualitative',
    'statistical', 'statistics', 'statistically', 'significant', 'significance', 'p-value', 'correlation',
    'laboratory', 'lab', 'testing', 'test', 'assay', 'sample', 'specimen',
    'evidence', 'evidence-based', 'empirical', 'empirically', 'proven', 'proof',
    'biology', 'biological', 'botany', 'botanical', 'plant science', 'horticulture', 'horticultural',
    'genetics', 'genetic', 'genome', 'dna', 'rna', 'gene', 'genetic modification', 'gm',
    'pharmacology', 'pharmacological', 'pharmacokinetic', 'pharmacodynamic',
    'neuroscience', 'neurological', 'brain', 'neural', 'neuron', 'neurotransmitter',
    'endocannabinoid', 'endocannabinoid system', 'ecs', 'receptor', 'cb1', 'cb2',

    # Psychology/Mental Health
    'psychology', 'psychological', 'psychologist', 'mental health', 'mental illness',
    'depression', 'depressive', 'depressed', 'mood', 'mood disorder', 'bipolar', 'manic',
    'anxiety', 'anxious', 'panic', 'panic attack', 'phobia', 'social anxiety',
    'ptsd', 'post-traumatic stress', 'trauma', 'traumatic', 'stress', 'stressed',
    'adhd', 'attention deficit', 'hyperactivity', 'hyperactive', 'focus', 'concentration',
    'autism', 'autistic', 'autism spectrum', 'asd', 'asperger',
    'addiction', 'addictive', 'substance abuse', 'dependence', 'dependent', 'withdrawal',
    'cognitive', 'cognition', 'memory', 'recall', 'learning', 'brain function',
    'behavior', 'behavioral', 'behavioral therapy', 'cbt', 'cognitive behavioral therapy',
    'therapy', 'therapist', 'counseling', 'counselor', 'psychotherapy',
    'wellbeing', 'wellness', 'mental wellness', 'emotional', 'emotion',
    'mindfulness', 'meditation', 'relaxation', 'calm', 'peaceful',

    # General Important Terms
    'optimal', 'optimum', 'ideal', 'best', 'recommended', 'suggested',
    'effective', 'efficacy', 'efficient', 'efficiency',
    'safe', 'safety', 'risk', 'risky', 'dangerous', 'harmful',
    'benefit', 'beneficial', 'advantage', 'advantageous',
    'quality', 'high quality', 'premium', 'grade', 'grade a',
    'organic', 'organically', 'natural', 'naturally', 'synthetic', 'artificial'
]))

# Synonym map for query expansion.
# Each key is a query term; its value lists the alternative terms for it.
# The mapping is not symmetric: a word listed as a synonym is not necessarily
# a key itself.
SYNONYMS = {
    # Cannabis, cultivation and related domain terms
    "cannabis": ["marijuana", "weed", "hemp", "pot", "ganja"],
    "marijuana": ["cannabis", "weed", "hemp", "pot", "ganja"],
    "weed": ["cannabis", "marijuana", "hemp", "pot"],
    "growing": ["cultivation", "farming", "planting"],
    "cultivation": ["growing", "farming", "planting"],
    "hydroponics": ["hydro", "soilless", "dwc", "nft"],
    "hydro": ["hydroponics"],
    "lighting": ["lights", "lamps", "led", "hps"],
    "nutrients": ["fertilizer", "food", "feeding"],
    "fertilizer": ["nutrients", "food"],
    "soil": ["medium", "dirt", "substrate", "earth"],
    "medical": ["medicinal", "therapeutic", "health"],
    "legal": ["law", "legislation", "regulation"],
    "law": ["legal", "legislation", "statute"],
    "business": ["industry", "market", "commercial"],
    "yield": ["harvest", "production", "output"],
    "extract": ["concentrate", "oil", "wax", "shatter"],
    "thc": ["potency", "psychoactive"],
    "cbd": ["cannabidiol", "non-psychoactive"],

    # Golf terms
    "golf": ["golfing", "links", "course", "green"],
    "putting": ["putt", "putts", "putter", "green", "on the green"],
    "golfing": ["golf", "playing golf", "course"],
    "putt": ["putting", "putter"],
    "techniques": ["methods", "tips", "strategies", "approaches", "ways"],
    "beginner": ["novice", "starter", "newcomer", "amateur"],

    # History/Ancient terms
    "ancient": ["old", "antique", "archaic", "prehistoric", "early", "historical"],
    "settlers": ["colonists", "colonizers", "pioneers", "immigrants", "inhabitants"],
    "america": ["americas", "new world", "united states", "usa", "north america"],
    "columbus": ["christopher columbus", "colombo", "explorer", "discoverer"],
    "discovery": ["exploration", "finding", "finding out", "uncovering"],
    "history": ["historical", "past", "chronicle", "record", "timeline"],
    "historical": ["history", "past", "ancient", "old"],

    # AI/Technology terms
    "artificial intelligence": ["ai", "machine learning", "ml", "neural networks"],
    "ai": ["artificial intelligence", "machine intelligence", "cognitive computing"],
    "intelligence": ["ai", "smart", "cognitive", "reasoning"],
    "machine learning": ["ml", "ai", "deep learning", "neural networks"],

    # General question words
    "what": ["explain", "describe", "tell me about", "information about"],
    "how": ["method", "way", "process", "technique"],
    "why": ["reason", "cause", "explanation", "purpose"],
    "when": ["time", "date", "period", "era"],
    "where": ["location", "place", "site", "area"],
    "who": ["person", "people", "individual", "figure"],
}

# Measurement patterns
#
# Regex source strings; capture groups hold the numeric value(s) and, for the
# business patterns, the unit/metric word.
# Range separators are written as (?:[-–—]|to): a hyphen, en dash, em dash or
# the word "to". (A character class such as [-–—to] would match a single "t"
# or "o" instead of the word.) The separator group is non-capturing so group
# numbering is unaffected.
# NOTE(review): all patterns are lowercase-only; presumably they are applied to
# lowercased text or compiled with re.IGNORECASE — confirm at the call site.
MEASUREMENT_PATTERNS = [
    # Growing/Cultivation
    r'ph\s*[:\-]?\s*(\d+\.?\d*)\s*(?:[-–—]|to)\s*(\d+\.?\d*)',  # pH 6.5-8.0, pH 6.5 to 8.0
    r'(\d+\.?\d*)\s*(?:[-–—]|to)\s*(\d+\.?\d*)\s*ph',  # 6.5-8.0 pH, 6.5 to 8.0 pH
    r'(\d+\.?\d*)\s*ppm',  # 500 ppm
    r'(\d+\.?\d*)\s*°[cf]',  # 75°F
    r'(\d+\.?\d*)\s*percent',  # 50 percent
    r'(\d+\.?\d*)\s*%',  # 50%

    # Health/Medical
    r'(\d+\.?\d*)\s*mg',  # 10 mg
    r'(\d+\.?\d*)\s*milligram',  # 10 milligram
    r'(\d+\.?\d*)\s*mcg',  # 5 mcg
    r'(\d+\.?\d*)\s*ml',  # 2.5 ml
    r'(\d+\.?\d*)\s*mg/kg',  # 1 mg/kg
    r'(\d+\.?\d*)\s*mg\s*(?:[-–—]|to)\s*(\d+\.?\d*)\s*mg',  # 5 mg - 10 mg, 5 mg to 10 mg

    # Law/Legal (trailing "s" optional so singular units match too)
    r'(\d+\.?\d*)\s*grams?',  # 28 grams, 1 gram
    r'(\d+\.?\d*)\s*ounces?',  # 1 ounce, 2 ounces
    r'(\d+\.?\d*)\s*pounds?',  # 1 pound, 5 pounds

    # Business/Economics
    r'\$(\d+[,\d]*\.?\d*)\s*(million|billion|thousand)',  # $1.5 billion
    r'(\d+\.?\d*)\s*%?\s*(roi|margin|growth)',  # 15% roi
    r'(\d+\.?\d*)\s*(?:%|percent)\s*(market|share)',  # 25% market share, 25 percent market share
]

# Language model settings: a small distilled DeepSeek reasoning model,
# picked as a smaller, accessible option for CPU use.
LANGUAGE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
MAX_NEW_TOKENS = 256
# NOTE(review): sampling is switched off below, so TEMPERATURE presumably has
# no effect on generation — confirm against the code that consumes these.
TEMPERATURE = 0.2
DO_SAMPLE = False

# Knowledge card preprocessing settings
# Upper bound on how many chunks get refined by the LLM; the value is set
# very high so that all chunks are processed.
KNOWLEDGE_CARD_LLM_BUDGET = 999_999

# Hybrid retrieval settings
HYBRID_SEARCH_OVERSAMPLE = 4
# Vector and keyword scores are weighted equally; the vector share was lowered
# and the keyword share raised to favor keyword relevance.
HYBRID_VECTOR_WEIGHT = 0.5
HYBRID_KEYWORD_WEIGHT = 0.5
HYBRID_MIN_KEYWORD_SCORE = 0.05  # Kept low so more results pass the keyword filter
HYBRID_BM25_WEIGHT = 0.25  # Extra weight for BM25 field scoring
MAX_CHUNKS_PER_SOURCE = 2  # Per-source cap, for more variety in results
MIN_CHUNK_SCORE = 0.7  # Minimum combined score threshold (raised for quality)
DEFAULT_RETRIEVAL_K = 30  # Default number of chunks to retrieve (broader context)

# Enrichment settings: default and maximum thread counts
ENRICHMENT_DEFAULT_THREADS = 2
ENRICHMENT_MAX_THREADS = 6

# Enrichment logging settings: both files sit in the metadata directory
# (METADATA_DIR is PROCESSED_DIR / "metadata").
ENRICHMENT_STATUS_PATH = METADATA_DIR / "enrichment_status.json"
ENRICHMENT_LOG_PATH = METADATA_DIR / "enrichment_log.jsonl"

# Web interface settings, overridable through the RAG_HOST / RAG_PORT
# environment variables (defaults: 127.0.0.1 and 5000).
WEB_HOST = os.getenv("RAG_HOST", "127.0.0.1")
WEB_PORT = int(os.getenv("RAG_PORT", "5000"))

# Memory system settings: index and metadata files live in MEMORY_DIR
MEMORY_INDEX_PATH = MEMORY_DIR.joinpath("memory_index.index")
MEMORY_METADATA_PATH = MEMORY_DIR.joinpath("memory_metadata.json")
CONVERSATION_BUFFER_SIZE = 10  # Number of most recent messages to keep
MEMORY_SUMMARY_INTERVAL = 10  # Summarize once every 10 messages
MEMORY_RETENTION_DAYS = 30  # Memories are kept for 30 days