import hashlib
import json
import re
from pathlib import Path

import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from config import (
    DOCUMENT_TYPE_CONFIGS,
    PDF_DIR,
    DOCUMENTS_DIR,
    METADATA_DIR,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    CHUNK_MIN_SIZE,
    CHUNK_MAX_SIZE,
    SENTENCE_OVERLAP,
    MIN_SENTENCES_PER_CHUNK,
    EMBEDDING_MODEL,
    SEMANTIC_CHUNKING_ENABLED,
    SEMANTIC_MAX_CHARS,
    SEMANTIC_SIMILARITY_THRESHOLD,
    TECHNICAL_TERMS,
    MEASUREMENT_PATTERNS,
)

# Prefer high-fidelity PDF extraction when available
try:
    import fitz  # PyMuPDF
    HAS_PYMUPDF = True
except ImportError:
    HAS_PYMUPDF = False

# Fall back to pdfminer.six for extraction; PyPDF2 is imported lazily as a last resort
try:
    from pdfminer.high_level import extract_text
    HAS_PDFMINER = True
except ImportError:
    HAS_PDFMINER = False

class PDFProcessor:
    def __init__(self):
        # Fallback splitter for emergency use only
        overlap_size = max(int(CHUNK_SIZE * 0.2), CHUNK_OVERLAP)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=overlap_size,
            length_function=len,
            separators=["\n\n", "\n", ". ", " "]
        )
        # DEPRECATED: Semantic chunking disabled. The embedding model is left unset;
        # load SentenceTransformer(EMBEDDING_MODEL) here if this is ever re-enabled.
        self.semantic_chunking_enabled = False
        self.embedding_model = None
        self.semantic_max_chars = SEMANTIC_MAX_CHARS
        self.semantic_similarity_threshold = SEMANTIC_SIMILARITY_THRESHOLD
        
        # Boilerplate patterns for filtering
        self.boilerplate_patterns = [
            r'copyright\s+©?\s*\d{4}',
            r'all rights reserved',
            r'page\s+\d+\s+of\s+\d+',
            r'confidential',
            r'draft',
            r'^\s*$',  # Empty lines
            r'^\d+$',  # Page numbers only
            r'[a-f0-9]{32,}',  # Hashes/IDs
            r'^[\s\W]*$',  # Only whitespace/punctuation
        ]
        
        # Processing statistics for monitoring
        self.processing_stats = {
            'start_time': None,
            'end_time': None,
            'chunks_created': 0,
            'chunks_filtered': 0,
            'sections_detected': 0,
            'tables_detected': 0,
            'lists_detected': 0,
            'memory_peak_mb': 0,
        }

    def get_file_hash(self, file_path):
        """Generate hash of file content to detect changes"""
        with open(file_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def get_processed_files(self):
        """Get record of already processed files"""
        metadata_file = METADATA_DIR / "processed_files.json"
        if metadata_file.exists():
            with open(metadata_file, 'r') as f:
                return json.load(f)
        return {}

    def save_processed_file(self, file_path, file_hash, chunks):
        """Record processed file and save chunks"""
        filename = file_path.name

        # Update metadata
        metadata = self.get_processed_files()
        metadata[filename] = {
            'hash': file_hash,
            'chunk_count': len(chunks),
            'processed_date': str(file_path.stat().st_mtime)
        }

        with open(METADATA_DIR / "processed_files.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        # Save document chunks
        chunk_file = DOCUMENTS_DIR / f"{Path(filename).stem}_chunks.json"
        with open(chunk_file, 'w') as f:
            json.dump(chunks, f, indent=2)

    def extract_text_from_file(self, file_path):
        """Extract text from PDF or text files using best available method"""
        if file_path.suffix.lower() == '.pdf':
            # Prefer PyMuPDF for layout-aware extraction
            if HAS_PYMUPDF:
                try:
                    return self._extract_with_pymupdf(file_path)
                except Exception as e:
                    print(f"Warning: PyMuPDF extraction failed for {file_path}: {e}")

            try:
                if HAS_PDFMINER:
                    # Use pdfminer for better text extraction
                    text = extract_text(str(file_path))
                    return text
                else:
                    # Fallback to PyPDF2
                    from PyPDF2 import PdfReader
                    reader = PdfReader(file_path)
                    text = ""
                    for page in reader.pages:
                        text += page.extract_text() + "\n"
                    return text
            except Exception as e:
                print(f"Warning: PDF extraction failed for {file_path}: {e}")
                # Try fallback method
                try:
                    from PyPDF2 import PdfReader
                    reader = PdfReader(file_path)
                    text = ""
                    for page in reader.pages:
                        text += page.extract_text() + "\n"
                    return text
                except Exception as e2:
                    print(f"Error: Both PDF extraction methods failed for {file_path}: {e2}")
                    # Return empty text so upstream (airagagent/app.py) can trigger OCR fallback
                    # instead of chunking an error message.
                    return ""
        elif file_path.suffix.lower() == '.txt':
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            except UnicodeDecodeError:
                # Try with latin-1 encoding for older files
                try:
                    with open(file_path, 'r', encoding='latin-1') as f:
                        return f.read()
                except Exception as e:
                    print(f"Warning: Failed to read text file {file_path} with latin-1 encoding: {e}")
                    raise
            except Exception as e:
                print(f"Error reading text file {file_path}: {e}")
                raise
        else:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

    def _extract_with_pymupdf(self, file_path: Path) -> str:
        """High-fidelity extraction using PyMuPDF with basic structural cues."""
        doc = fitz.open(file_path)
        pages = []
        try:
            for page in doc:
                text = page.get_text(
                    "text",
                    flags=fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE
                )
                pages.append(text)
        finally:
            doc.close()

        return "\n".join(pages)

    def _detect_document_type(self, text: str, filename: str) -> str:
        """
        Detect document type based on content and filename.
        Returns: 'technical', 'research', 'legal', 'manual', or 'default'
        """
        text_lower = text.lower()
        filename_lower = filename.lower()
        
        # Technical document indicators
        technical_indicators = [
            'specification', 'technical', 'api', 'protocol', 'standard',
            'measurement', 'calibration', 'configuration', 'parameter',
            'ph', 'ppm', 'temperature', 'voltage', 'current'
        ]
        
        # Research document indicators
        research_indicators = [
            'abstract', 'introduction', 'methodology', 'results', 'conclusion',
            'references', 'citation', 'study', 'research', 'experiment',
            'hypothesis', 'peer-reviewed', 'journal'
        ]
        
        # Legal document indicators
        legal_indicators = [
            'whereas', 'hereby', 'pursuant', 'statute', 'regulation',
            'compliance', 'legal', 'law', 'act', 'ordinance', 'clause',
            'section', 'subsection', 'article', 'paragraph'
        ]
        
        # Manual/guide indicators
        manual_indicators = [
            'manual', 'guide', 'tutorial', 'how to', 'step', 'procedure',
            'instruction', 'user', 'getting started', 'quick start'
        ]
        
        # Score each type
        scores = {
            'technical': sum(1 for ind in technical_indicators if ind in text_lower or ind in filename_lower),
            'research': sum(1 for ind in research_indicators if ind in text_lower or ind in filename_lower),
            'legal': sum(1 for ind in legal_indicators if ind in text_lower or ind in filename_lower),
            'manual': sum(1 for ind in manual_indicators if ind in text_lower or ind in filename_lower),
        }
        
        # Return type with highest score, or default if no clear match
        max_score = max(scores.values())
        if max_score >= 2:  # Need at least 2 indicators
            return max(scores, key=scores.get)
        
        return 'default'

    def _get_chunking_config(self, document_type: str) -> dict:
        """Get chunking configuration for a specific document type."""
        return DOCUMENT_TYPE_CONFIGS.get(document_type, DOCUMENT_TYPE_CONFIGS['default'])

    def process_pdf(self, file_path):
        """Process a single PDF file with Phase 3 enhancements"""
        import time
        import os
        
        # Try to import psutil for memory monitoring (optional)
        try:
            import psutil
            HAS_PSUTIL = True
        except ImportError:
            HAS_PSUTIL = False
        
        filename = file_path.name
        file_hash = self.get_file_hash(file_path)
        processed_files = self.get_processed_files()

        # Check if file needs processing
        if filename in processed_files:
            if processed_files[filename]['hash'] == file_hash:
                print(f"✓ {filename} already processed, skipping...")
                return None

        print(f"Processing {filename}...")

        # Initialize processing statistics
        if HAS_PSUTIL:
            process = psutil.Process(os.getpid())
        self.processing_stats = {
            'start_time': time.time(),
            'end_time': None,
            'chunks_created': 0,
            'chunks_filtered': 0,
            'sections_detected': 0,
            'tables_detected': 0,
            'lists_detected': 0,
            'memory_peak_mb': 0,
            'document_type': 'default',
        }

        # Extract text
        text = self.extract_text_from_file(file_path)
        document_title = self._extract_document_title(text, filename)
        
        # Detect document type (Phase 3.1)
        document_type = self._detect_document_type(text, filename)
        self.processing_stats['document_type'] = document_type
        print(f"  Detected document type: {document_type}")
        
        # Get document-type-specific config
        doc_config = self._get_chunking_config(document_type)
        
        # Use structure-aware chunking with document-type-specific parameters
        chunks = self._structure_aware_chunk_text(text, doc_config)
        chunk_method = f"structure_aware_{document_type}"
        self.processing_stats['chunks_created'] = len(chunks)
        
        # Fallback to simple sentence-based if structure-aware fails
        if not chunks:
            print(f"Warning: Structure-aware chunking failed for {filename}, using sentence-based")
            chunks = self._sentence_based_chunk_text(text, "", 0, doc_config)
            chunk_method = f"sentence_based_{document_type}"
        
        # Fallback to recursive if sentence chunking fails
        if not chunks:
            print(f"Warning: Sentence chunking failed for {filename}, using recursive fallback")
            chunks = self.text_splitter.split_text(text)
            chunk_method = "recursive_fallback"

        # Filter out boilerplate and low-quality chunks
        filtered_chunks = []
        for i, chunk in enumerate(chunks):
            if self._is_valid_chunk(chunk, doc_config):
                filtered_chunks.append(chunk)
            else:
                self.processing_stats['chunks_filtered'] += 1
                print(f"  Filtered out chunk {i} (boilerplate/low quality)")

        # Add clean metadata to chunks
        chunk_data = []
        for i, chunk in enumerate(filtered_chunks):
            # Try to detect section from chunk content (heuristic)
            section_title = ""
            section_level = 0
            # Look for heading-like patterns at the start of chunk
            lines = chunk.split('\n')
            if lines and len(lines[0].strip()) < 80:
                first_line = lines[0].strip()
                # Check if first line looks like a heading
                if first_line.isupper() and len(first_line) >= 4:
                    section_title = first_line
                    section_level = 1
                elif re.match(r'^\d+\.\s+', first_line):
                    section_title = re.sub(r'^\d+\.\s+', '', first_line)
                    section_level = 1
            
            metadata = self._extract_chunk_metadata(
                chunk, filename, i, len(filtered_chunks), document_title, chunk_method,
                section_title, section_level
            )
            chunk_data.append({
                'content': chunk,
                'metadata': metadata
            })

        # Update processing statistics
        self.processing_stats['end_time'] = time.time()
        if HAS_PSUTIL:
            self.processing_stats['memory_peak_mb'] = process.memory_info().rss / 1024 / 1024
        else:
            # Fallback: resource.getrusage reports ru_maxrss in kilobytes on Linux
            # (bytes on macOS), so this value is approximate
            try:
                import resource
                self.processing_stats['memory_peak_mb'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
            except Exception:
                self.processing_stats['memory_peak_mb'] = 0
        
        # Calculate chunk size distribution for evaluation
        chunk_sizes = [len(chunk['content']) for chunk in chunk_data]
        if chunk_sizes:
            self.processing_stats['chunk_size_avg'] = sum(chunk_sizes) / len(chunk_sizes)
            self.processing_stats['chunk_size_min'] = min(chunk_sizes)
            self.processing_stats['chunk_size_max'] = max(chunk_sizes)
            self.processing_stats['chunk_size_std'] = (sum((x - self.processing_stats['chunk_size_avg'])**2 for x in chunk_sizes) / len(chunk_sizes))**0.5

        self.save_processed_file(file_path, file_hash, chunk_data)
        
        # Print processing summary
        duration = self.processing_stats['end_time'] - self.processing_stats['start_time']
        print(f"✓ Processed {filename} into {len(chunk_data)} chunks (filtered from {len(chunks)})")
        print(f"  Type: {document_type}, Time: {duration:.2f}s, Memory: {self.processing_stats['memory_peak_mb']:.1f}MB")
        print(f"  Chunk sizes: avg={self.processing_stats.get('chunk_size_avg', 0):.0f}, "
              f"min={self.processing_stats.get('chunk_size_min', 0)}, "
              f"max={self.processing_stats.get('chunk_size_max', 0)}")
        
        return chunk_data

    def get_new_files(self):
        """Get list of files (PDFs and text files) that need processing"""
        supported_extensions = ['*.pdf', '*.txt']
        files = []
        for ext in supported_extensions:
            files.extend(list(PDF_DIR.glob(ext)))

        processed_files = self.get_processed_files()
        new_files = []

        for file_path in files:
            if file_path.name not in processed_files:
                new_files.append(file_path)
            else:
                # Check if file has been modified
                current_hash = self.get_file_hash(file_path)
                if processed_files[file_path.name]['hash'] != current_hash:
                    new_files.append(file_path)

        return new_files

    def process_all_new(self):
        """Process all new or modified files"""
        new_files = self.get_new_files()
        all_chunks = []

        if not new_files:
            print("No new files to process.")
            return []

        print(f"Found {len(new_files)} new/modified files to process...")

        for file_path in new_files:
            chunks = self.process_pdf(file_path)
            if chunks:
                all_chunks.extend(chunks)

        return all_chunks

    def _extract_document_title(self, text: str, fallback: str) -> str:
        """Derive a human friendly title from the extracted text."""
        if not text:
            return Path(fallback).stem

        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            if len(line) < 6 or len(line.split()) == 1:
                continue
            if any(token in line.lower() for token in ['library', 'due date', 'copyright', 'isbn', 'www', 'email']):
                continue
            if sum(c.isdigit() for c in line) > 3:
                continue
            if line.upper() == line and sum(c.isalpha() for c in line) > 0:
                # Likely an all-caps header rather than a title; skip this line
                continue
            cleaned = re.sub(r'[^A-Za-z0-9\s\'\-:,]', '', line)
            if len(cleaned.split()) < 3:
                continue
            return cleaned[:200]
        return Path(fallback).stem.replace('_', ' ')

    def _semantic_chunk_text(self, text: str):
        """Split text into semantically coherent chunks using paragraph embeddings."""
        if not self.embedding_model:
            return None

        paragraphs = [para.strip() for para in re.split(r'\n{2,}', text) if len(para.strip()) > 0]
        if len(paragraphs) < 4:
            return None

        try:
            embeddings = self.embedding_model.encode(
                paragraphs,
                batch_size=32,
                convert_to_numpy=True,
                normalize_embeddings=True
            )
        except Exception as exc:
            print(f"Warning: semantic chunk embedding failed ({exc}).")
            return None

        chunks = []
        current = [paragraphs[0]]
        current_len = len(paragraphs[0])

        for idx in range(1, len(paragraphs)):
            similarity = float(np.dot(embeddings[idx], embeddings[idx - 1]))
            candidate_len = current_len + len(paragraphs[idx]) + 2

            if similarity < self.semantic_similarity_threshold or candidate_len > self.semantic_max_chars:
                chunk_text = '\n\n'.join(current).strip()
                if chunk_text:
                    chunks.append(chunk_text)
                current = [paragraphs[idx]]
                current_len = len(paragraphs[idx])
            else:
                current.append(paragraphs[idx])
                current_len = candidate_len

        if current:
            chunk_text = '\n\n'.join(current).strip()
            if chunk_text:
                chunks.append(chunk_text)

        chunks = self._merge_small_chunks(chunks)

        if len(chunks) <= 1:
            return None

        return chunks

    def _detect_headings(self, text: str) -> list:
        """
        Detect headings and sections in the document.
        Returns list of tuples: (heading_text, position, level)
        Level: 1 = major section, 2 = subsection, 3 = sub-subsection
        """
        headings = []
        lines = text.split('\n')
        
        # Patterns checked below: ALL-CAPS lines (level 1), numbered headings such
        # as "1. Introduction" or "1.1 Section" (level from numbering depth),
        # markdown-style "#" headings (level from the number of #), and short
        # Title Case lines (level 2).
        
        for i, line in enumerate(lines):
            line_stripped = line.strip()
            if not line_stripped or len(line_stripped) < 3:
                continue
            
            # Check ALL CAPS (must be at least 4 chars, mostly uppercase)
            if line_stripped.isupper() and len(line_stripped) >= 4:
                headings.append((line_stripped, i, 1))
                continue
            
            # Check numbered headings ("1. Introduction", "1.1 Section", "1.2.3 Detail");
            # require a dot after the first number so bare counts and years don't match
            numbered_match = re.match(r'^\s*(\d+)\.(\d+)?(?:\.(\d+))?\s+(.+)$', line_stripped)
            if numbered_match:
                depth = sum(1 for g in numbered_match.groups()[:3] if g)
                level = min(depth, 3)
                heading_text = numbered_match.group(4)
                if len(heading_text) > 3:
                    headings.append((heading_text, i, level))
                continue
            
            # Check markdown-style headings
            markdown_match = re.match(r'^\s*(#{1,3})\s+(.+)$', line_stripped)
            if markdown_match:
                level = len(markdown_match.group(1))
                heading_text = markdown_match.group(2)
                if len(heading_text) > 3:
                    headings.append((heading_text, i, level))
                continue
            
            # Check Title Case (must have at least 2 capitalized words)
            if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+$', line_stripped):
                # Make sure it's not a regular sentence (too long or ends with punctuation)
                if len(line_stripped) < 80 and not line_stripped.endswith(('.', '!', '?', ':')):
                    headings.append((line_stripped, i, 2))
        
        return headings

    def _split_into_sections(self, text: str, headings: list) -> list:
        """
        Split text into sections based on detected headings.
        Returns list of tuples: (section_title, section_content, level)
        """
        if not headings:
            return [("", text, 0)]  # No headings, treat as single section
        
        sections = []
        lines = text.split('\n')
        
        # Add document start as first section if needed
        if headings[0][1] > 0:
            start_content = '\n'.join(lines[:headings[0][1]]).strip()
            if start_content:
                sections.append(("Introduction", start_content, 0))
        
        # Process sections between headings
        for i in range(len(headings)):
            heading_text, heading_pos, heading_level = headings[i]
            
            # Determine end position (next heading or end of document)
            end_pos = headings[i+1][1] if i+1 < len(headings) else len(lines)
            
            # Extract section content
            section_content = '\n'.join(lines[heading_pos+1:end_pos]).strip()
            
            if section_content:
                sections.append((heading_text, section_content, heading_level))
        
        return sections

    def _detect_tables_and_lists(self, text: str) -> list:
        """
        Detect tables and lists in text.
        Returns list of tuples: (content, type, start_pos, end_pos)
        type: 'table', 'list', 'code'
        """
        detected = []
        lines = text.split('\n')
        
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            
            # Detect tables (multiple columns separated by spaces/tabs, consistent structure)
            if i < len(lines) - 2:
                # Check if this looks like a table header
                if re.match(r'^[\w\s]+\|[\w\s]+', line) or re.match(r'^[\w\s]+\t+[\w\s]+', line):
                    # Look for more table rows
                    table_lines = [line]
                    j = i + 1
                    while j < len(lines) and j < i + 50:  # Max 50 rows
                        next_line = lines[j].strip()
                        # Check if it continues the table pattern
                        if re.match(r'^[\w\s\-\.]+\|[\w\s\-\.]+', next_line) or \
                           re.match(r'^[\w\s\-\.]+\t+[\w\s\-\.]+', next_line) or \
                           re.match(r'^[\-=\s]+$', next_line):  # Separator line
                            table_lines.append(next_line)
                            j += 1
                        else:
                            break
                    
                    if len(table_lines) >= 2:
                        table_content = '\n'.join(table_lines)
                        detected.append((table_content, 'table', i, j))
                        i = j
                        continue
            
            # Detect lists (bullet points, numbered lists)
            if re.match(r'^[\s]*[•\-\*\+]\s+', line) or re.match(r'^[\s]*\d+[\.\)]\s+', line):
                list_lines = [line]
                j = i + 1
                while j < len(lines) and j < i + 100:  # Max 100 items
                    next_line = lines[j].strip()
                    # Continue if it's another list item or indented continuation
                    if re.match(r'^[\s]*[•\-\*\+]\s+', next_line) or \
                       re.match(r'^[\s]*\d+[\.\)]\s+', next_line) or \
                       (next_line and len(next_line) > 0 and lines[j].startswith(' ')):
                        list_lines.append(next_line)
                        j += 1
                    elif not next_line:  # Empty line might continue list
                        j += 1
                    else:
                        break
                
                if len(list_lines) >= 2:
                    list_content = '\n'.join(list_lines)
                    detected.append((list_content, 'list', i, j))
                    i = j
                    continue
            
            i += 1
        
        return detected

    def _split_into_sentences(self, text: str) -> list:
        """Split text into sentences using regex-based approach."""
        if not text:
            return []
        
        # Enhanced sentence splitting regex
        # Handles: . ! ? followed by space and capital letter or end of text
        # Also handles: ." .' .) etc.
        sentence_endings = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])\s*$|(?<=[.!?"\'])\s+(?=[A-Z])'
        sentences = re.split(sentence_endings, text)
        
        # Clean and filter sentences
        cleaned = []
        for sent in sentences:
            sent = sent.strip()
            if len(sent) > 10:  # Minimum sentence length
                cleaned.append(sent)
        
        return cleaned

    def _sentence_based_chunk_text(self, text: str, section_title: str = "", section_level: int = 0, config: dict = None) -> list:
        """
        Split text into chunks using sentence-based sliding window.
        This is the PRIMARY chunking method - deterministic and reliable.
        Now supports section-aware chunking and document-type-specific parameters.
        """
        # Use document-type-specific config or defaults
        if config is None:
            config = DOCUMENT_TYPE_CONFIGS['default']
        
        chunk_size = config.get('chunk_size', CHUNK_SIZE)
        chunk_min_size = config.get('chunk_min_size', CHUNK_MIN_SIZE)
        chunk_max_size = config.get('chunk_max_size', CHUNK_MAX_SIZE)
        sentence_overlap = config.get('sentence_overlap', SENTENCE_OVERLAP)
        min_sentences = config.get('min_sentences', MIN_SENTENCES_PER_CHUNK)
        
        if not text or len(text.strip()) < chunk_min_size:
            return []
        
        # Split into sentences
        sentences = self._split_into_sentences(text)
        
        if len(sentences) < min_sentences:
            return []  # Not enough sentences, fallback will handle
        
        chunks = []
        current_chunk = []
        current_length = 0
        overlap_sentences = []
        
        for sentence in sentences:
            sentence_length = len(sentence) + 1  # +1 for space
            
            # Check if adding this sentence would exceed max size
            if current_length + sentence_length > chunk_max_size and current_chunk:
                # Save current chunk
                chunk_text = ' '.join(current_chunk).strip()
                if len(chunk_text) >= chunk_min_size:
                    chunks.append(chunk_text)
                
                # Start new chunk with overlap
                overlap_sentences = current_chunk[-sentence_overlap:] if len(current_chunk) >= sentence_overlap else current_chunk
                current_chunk = overlap_sentences.copy()
                current_length = sum(len(s) + 1 for s in current_chunk)
            
            # Add sentence to current chunk
            current_chunk.append(sentence)
            current_length += sentence_length
            
            # If we've reached target size, consider creating chunk
            if current_length >= chunk_size and len(current_chunk) >= min_sentences:
                chunk_text = ' '.join(current_chunk).strip()
                if len(chunk_text) >= chunk_min_size:
                    chunks.append(chunk_text)
                    # Start new chunk with overlap
                    overlap_sentences = current_chunk[-sentence_overlap:] if len(current_chunk) >= sentence_overlap else []
                    current_chunk = overlap_sentences.copy()
                    current_length = sum(len(s) + 1 for s in current_chunk)
        
        # Add final chunk if it meets requirements
        if current_chunk:
            chunk_text = ' '.join(current_chunk).strip()
            if len(chunk_text) >= chunk_min_size:
                chunks.append(chunk_text)
        
        return chunks if len(chunks) > 0 else []

    def _structure_aware_chunk_text(self, text: str, config: dict = None) -> list:
        """
        Structure-aware chunking: detect sections, process hierarchically.
        This is the ENHANCED chunking method that respects document structure.
        Now supports document-type-specific parameters.
        """
        # Use document-type-specific config or defaults
        if config is None:
            config = DOCUMENT_TYPE_CONFIGS['default']
        
        chunk_min_size = config.get('chunk_min_size', CHUNK_MIN_SIZE)
        chunk_max_size = config.get('chunk_max_size', CHUNK_MAX_SIZE)
        
        if not text or len(text.strip()) < chunk_min_size:
            return []
        
        # Step 1: Detect headings and sections
        headings = self._detect_headings(text)
        self.processing_stats['sections_detected'] = len(headings)
        
        # Step 2: Split into sections
        sections = self._split_into_sections(text, headings)
        
        # Step 3: Process each section independently
        all_chunks = []
        
        for section_title, section_content, section_level in sections:
            if not section_content or len(section_content.strip()) < chunk_min_size:
                continue
            
            # Step 3a: Detect and handle tables/lists separately
            tables_lists = self._detect_tables_and_lists(section_content)
            
            # Update statistics
            for _, content_type, _, _ in tables_lists:
                if content_type == 'table':
                    self.processing_stats['tables_detected'] += 1
                elif content_type == 'list':
                    self.processing_stats['lists_detected'] += 1
            
            if tables_lists:
                # Process tables/lists as atomic chunks
                processed_content = section_content
                for table_content, content_type, start_pos, end_pos in reversed(tables_lists):
                    # Replace table/list with placeholder, process separately
                    lines = processed_content.split('\n')
                    # Extract and save table/list as chunk
                    table_lines = lines[start_pos:end_pos]
                    table_text = '\n'.join(table_lines)
                    
                    # Tables/lists are kept as single chunks if they meet size requirements
                    if len(table_text) >= chunk_min_size and len(table_text) <= chunk_max_size:
                        all_chunks.append(table_text)
                    
                    # Remove from section content for further processing
                    processed_content = '\n'.join(lines[:start_pos] + lines[end_pos:])
                
                # Process remaining section content
                if processed_content.strip() and len(processed_content.strip()) >= chunk_min_size:
                    section_chunks = self._sentence_based_chunk_text(processed_content, section_title, section_level, config)
                    all_chunks.extend(section_chunks)
            else:
                # No tables/lists, process section normally
                section_chunks = self._sentence_based_chunk_text(section_content, section_title, section_level, config)
                all_chunks.extend(section_chunks)
        
        # If structure detection didn't work well, fall back to simple sentence chunking
        if len(all_chunks) == 0:
            return self._sentence_based_chunk_text(text, "", 0, config)
        
        return all_chunks

    def _is_valid_chunk(self, chunk: str, config: dict = None) -> bool:
        """
        Validate chunk quality - reject boilerplate and low-quality chunks.
        Enhanced quality gates for Phase 2 & 3.
        Returns True if chunk should be kept, False if it should be filtered out.
        """
        # Use document-type-specific config or defaults
        if config is None:
            config = DOCUMENT_TYPE_CONFIGS['default']
        
        chunk_min_size = config.get('chunk_min_size', CHUNK_MIN_SIZE)
        min_sentences = config.get('min_sentences', MIN_SENTENCES_PER_CHUNK)
        
        if not chunk or len(chunk.strip()) < chunk_min_size:
            return False
        
        # Check for table/list content - these are valid even if they don't meet sentence requirements
        is_table = bool(re.search(r'\|', chunk))
        is_list = bool(re.search(r'^\s*[•\-\*\+]\s+', chunk, re.MULTILINE))
        if is_table or is_list:
            # Tables/lists are valid if they have reasonable content
            return len(chunk.strip()) >= chunk_min_size
        
        # Check minimum sentence count (for regular text)
        sentences = self._split_into_sentences(chunk)
        if len(sentences) < min_sentences:
            return False
        
        # Check for boilerplate patterns
        chunk_lower = chunk.lower()
        for pattern in self.boilerplate_patterns:
            if re.search(pattern, chunk_lower, re.IGNORECASE):
                return False
        
        # Reject chunks that are mostly symbols/punctuation
        # (fewer than 20% alphanumeric or whitespace characters)
        alphanumeric_chars = sum(1 for c in chunk if c.isalnum() or c.isspace())
        if len(chunk) > 0 and alphanumeric_chars / len(chunk) < 0.2:
            return False
        
        # Enhanced semantic density check (unique words ratio)
        words = re.findall(r'\b\w+\b', chunk_lower)
        if len(words) > 0:
            unique_words = len(set(words))
            unique_ratio = unique_words / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                return False
            
            # Additional check: ensure we have meaningful content (not just stop words)
            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they'}
            meaningful_words = [w for w in words if w not in stop_words]
            if len(meaningful_words) < 5:  # Need at least 5 meaningful words
                return False
        
        return True

    def _merge_small_chunks(self, chunks, min_chars: int = 280):
        """Avoid creating very small dangling chunks."""
        if not chunks:
            return chunks

        merged = []
        for chunk in chunks:
            if merged and len(chunk) < min_chars:
                merged[-1] = merged[-1] + "\n\n" + chunk
            else:
                merged.append(chunk)

        return merged

    def _extract_chunk_metadata(self, chunk_content: str, source: str, chunk_id: int, 
                                total_chunks: int, document_title: str, chunk_method: str,
                                section_title: str = "", section_level: int = 0) -> dict:
        """
        Extract clean, actionable metadata from chunk content.
        Enhanced with section information for Phase 2.
        """
        content_lower = chunk_content.lower()
        
        # Basic structural info
        sentences = self._split_into_sentences(chunk_content)
        words = re.findall(r'\b\w+\b', chunk_content)
        
        # Extract page number if available; the word boundary keeps words that
        # merely end in "p" (e.g. "top 10") from producing false matches
        page_number = None
        page_match = re.search(r'\b(?:page|p)\.?\s*(\d+)\b', content_lower[:200], re.IGNORECASE)
        if page_match:
            try:
                page_number = int(page_match.group(1))
            except ValueError:
                pass
        
        # Detect content type
        is_table = bool(re.search(r'\|', chunk_content))
        is_list = bool(re.search(r'^\s*[•\-\*\+]\s+', chunk_content, re.MULTILINE))
        content_type = 'table' if is_table else ('list' if is_list else 'text')
        
        # Content quality indicators (simplified)
        has_technical_content = any(term in content_lower for term in ['ph', 'ppm', 'temperature', 'measurement', 'data', 'study', 'research'])
        is_boilerplate = any(re.search(pattern, content_lower, re.IGNORECASE) for pattern in self.boilerplate_patterns)
        
        # Readability score (simple heuristic: sentence length variance)
        if sentences:
            avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
            readability_score = min(1.0, max(0.0, 1.0 - abs(avg_sentence_length - 20) / 20))
        else:
            readability_score = 0.5
        
        # Clean excerpt
        clean_excerpt = re.sub(r'\s+', ' ', chunk_content[:200]).strip()
        
        return {
            'source': source,
            'chunk_id': chunk_id,
            'total_chunks': total_chunks,
            'document_title': document_title,
            'chunk_method': chunk_method,
            'page': page_number,
            'section': section_title if section_title else None,
            'section_level': section_level if section_title else None,
            'structural_info': {
                'sentence_count': len(sentences),
                'word_count': len(words),
                'char_count': len(chunk_content),
                'content_type': content_type,
            },
            'content_quality': {
                'readability_score': round(readability_score, 2),
                'has_technical_content': has_technical_content,
                'is_boilerplate': is_boilerplate,
            },
            'clean_excerpt': clean_excerpt
        }
