#!/usr/bin/env python3
"""
Reprocess all PDFs with the new optimized chunking system (Phases 1-3).
This script will:
1. Clear existing vector store and chunks
2. Reprocess all PDFs with new chunking methods
3. Rebuild the vector store
"""

import json
import traceback
from pathlib import Path
from pdf_processor import PDFProcessor
from vector_store import VectorStore
from config import (
    PDF_DIR,
    PROCESSED_DIR,
    EMBEDDINGS_DIR,
    DOCUMENTS_DIR,
    METADATA_DIR,
    NOTES_DIR,
    DIGESTS_DIR,
    CAPSULES_DIR,
)


def clear_existing_data():
    """Clear all existing processed data and vector store"""
    
    print("🗑️  Clearing existing processed data...")
    
    # Clear vector store files
    if EMBEDDINGS_DIR.exists():
        for file in EMBEDDINGS_DIR.glob("*"):
            if file.is_file():
                file.unlink()
                print(f"  Deleted: {file.name}")
    
    # Clear processed chunks
    if DOCUMENTS_DIR.exists():
        chunk_files = list(DOCUMENTS_DIR.glob("*_chunks.json"))
        for file in chunk_files:
            file.unlink()
        print(f"  Deleted {len(chunk_files)} chunk files")
    
    # Clear enriched cards
    if NOTES_DIR.exists():
        card_files = list(NOTES_DIR.glob("*_cards.json"))
        for file in card_files:
            file.unlink()
        print(f"  Deleted {len(card_files)} card files")
    
    # Clear metadata (but keep directory structure)
    metadata_file = METADATA_DIR / "processed_files.json"
    if metadata_file.exists():
        metadata_file.unlink()
        print(f"  Deleted: processed_files.json")
    
    # Clear digests and capsules
    if DIGESTS_DIR.exists():
        digest_files = list(DIGESTS_DIR.glob("*.json"))
        for file in digest_files:
            file.unlink()
        print(f"  Deleted {len(digest_files)} digest files")
    
    if CAPSULES_DIR.exists():
        capsule_files = list(CAPSULES_DIR.glob("*.json"))
        for file in capsule_files:
            file.unlink()
        print(f"  Deleted {len(capsule_files)} capsule files")
    
    # Clear any FAISS index files
    faiss_files = list(PROCESSED_DIR.glob("*.index")) + list(PROCESSED_DIR.glob("*.pkl"))
    for file in faiss_files:
        file.unlink()
        print(f"  Deleted: {file.name}")
    
    print("✅ Existing data cleared\n")


def reprocess_all_pdfs():
    """Reprocess all PDFs with new chunking system"""
    print("📄 Processing all PDFs with new optimized chunking...")
    
    processor = PDFProcessor()
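    # NOTE: process_pdf() is assumed to both return the chunks for each PDF and
    # persist them as *_chunks.json files under DOCUMENTS_DIR, which
    # rebuild_vector_store() reads back from disk in the next step.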
    
    # Get all PDF files
    pdf_files = list(PDF_DIR.glob("*.pdf"))
    
    if not pdf_files:
        print("⚠️  No PDF files found in pdf_directory/")
        return []
    
    print(f"Found {len(pdf_files)} PDF files\n")
    
    processed_chunks = []
    processed_count = 0
    
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"[{i}/{len(pdf_files)}] Processing: {pdf_file.name}")
        try:
            chunks = processor.process_pdf(pdf_file)
            if chunks:
                processed_chunks.extend(chunks)
                processed_count += 1
                print(f"  ✅ Created {len(chunks)} chunks\n")
            else:
                print("  ⚠️  No chunks created (may have been skipped)\n")
        except Exception as e:
            print(f"  ❌ Error processing {pdf_file.name}: {e}\n")
            traceback.print_exc()
    
    print(f"✅ Processed {processed_count}/{len(pdf_files)} PDFs successfully, total chunks: {len(processed_chunks)}")
    return processed_chunks


def rebuild_vector_store():
    """Rebuild the vector store with all processed chunks"""
    print("\n🔨 Rebuilding vector store...")
    
    # Load all chunks from processed documents
    all_chunks = []
    
    if DOCUMENTS_DIR.exists():
        chunk_files = list(DOCUMENTS_DIR.glob("*_chunks.json"))
        print(f"Found {len(chunk_files)} chunk files")
        
        for chunk_file in chunk_files:
            try:
                with open(chunk_file, 'r') as f:
                    chunks = json.load(f)
                    # Handle both old format (list of strings) and new format (list of dicts)
                    if chunks and isinstance(chunks[0], dict):
                        all_chunks.extend(chunks)
                    else:
                        # Convert old format to new format
                        for i, chunk_content in enumerate(chunks):
                            all_chunks.append({
                                'content': chunk_content,
                                'metadata': {'source': chunk_file.stem, 'chunk_id': i}
                            })
            except Exception as e:
                print(f"  ⚠️  Error loading {chunk_file.name}: {e}")
    
    if not all_chunks:
        print("⚠️  No chunks found to rebuild vector store")
        return
    
    print(f"Total chunks to index: {len(all_chunks)}")
    
    # Initialize vector store
    try:
        vector_store = VectorStore()
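        # VectorStore is assumed to embed each document and maintain the FAISS
        # index/metadata files that clear_existing_data() removed above.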
        
        # Add all chunks to vector store in batches
        print("Adding chunks to vector store...")
        batch_size = 100
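        # Indexing in batches keeps each add_documents() call (and any underlying
        # embedding request) to a manageable size and allows progress reporting.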
        
        for i in range(0, len(all_chunks), batch_size):
            batch = all_chunks[i:i+batch_size]
            # Format chunks for VectorStore (expects list of dicts with 'content' and 'metadata')
            formatted_chunks = []
            for chunk_data in batch:
                content = chunk_data.get('content', '')
                metadata = chunk_data.get('metadata', {})
                if content:
                    formatted_chunks.append({
                        'content': content,
                        'metadata': metadata
                    })
            
            if formatted_chunks:
                vector_store.add_documents(formatted_chunks)
                print(f"  Progress: {min(i+batch_size, len(all_chunks))}/{len(all_chunks)} chunks indexed...")
        
        # Save the vector store
        print("Saving vector store...")
        vector_store.save_index()
        vector_store.save_metadata()
        
        print(f"✅ Vector store rebuilt with {len(all_chunks)} chunks")
        
    except Exception as e:
        print(f"❌ Error rebuilding vector store: {e}")
        traceback.print_exc()


def main():
    """Main reprocessing workflow"""
    print("=" * 60)
    print("🔄 REPROCESSING ALL PDFs WITH NEW OPTIMIZED CHUNKING")
    print("=" * 60)
    print()
    
    # Step 1: Clear existing data
    clear_existing_data()
    
    # Step 2: Reprocess all PDFs
    processed_chunks = reprocess_all_pdfs()
    
    if not processed_chunks:
        print("⚠️  No chunks were processed. Exiting.")
        return
    
    # Step 3: Rebuild vector store
    rebuild_vector_store()
    
    print("\n" + "=" * 60)
    print("✅ REPROCESSING COMPLETE!")
    print("=" * 60)
    print(f"\nSummary:")
    print(f"  • Total chunks created: {len(processed_chunks)}")
    print(f"  • Vector store rebuilt")
    print(f"  • Ready for use with new optimized chunking system")


if __name__ == "__main__":
    main()

